# Hosting-page status text captured alongside the source ("Spaces: Running"); not part of the module.
import torch as th
def get_score(input_embs, label_ids, model_control, t=None, prefix_len=65):
    """Score each sequence by its summed token-level cross-entropy.

    The first ``prefix_len`` label positions are set to -100 on a *copy* of
    ``label_ids``; ``CrossEntropyLoss`` ignores that value by default, so those
    positions contribute zero loss.

    Args:
        input_embs: Embeddings forwarded to ``model_control`` as ``input_embs``.
        label_ids: Integer label tensor indexed as ``[row, position]``. It is
            cloned and never modified in place.
        model_control: Callable accepting the keywords ``input_embs``,
            ``labels`` and ``t`` and returning an object exposing ``loss`` and
            ``logits`` (``logits`` indexed as ``[row, position, class]``).
        t: Passed through to ``model_control`` unchanged.
        prefix_len: Number of leading label positions to ignore. Defaults to
            65, the value that used to be hard-coded here.

    Returns:
        list[float]: one entry per row of ``label_ids`` -- the sum over
        positions of the unreduced cross-entropy between the logits at
        position ``i`` and the (masked) label at position ``i + 1``.
    """
    masked_labels = label_ids.clone()
    masked_labels[:, :prefix_len] = -100

    model_out = model_control(input_embs=input_embs, labels=masked_labels, t=t)
    print(model_out.loss, 'final end')

    # Shift by one so the logits at position i are scored against label i + 1.
    shifted_logits = model_out.logits[:, :-1].contiguous()
    shifted_labels = masked_labels[:, 1:].contiguous()

    loss_fn = th.nn.CrossEntropyLoss(reduction='none')
    per_token = loss_fn(
        shifted_logits.view(-1, shifted_logits.size(-1)),
        shifted_labels.view(-1),
    ).reshape(shifted_labels.shape)
    return per_token.sum(dim=-1).tolist()
def langevin_fn3(debug_lst, model_control, model3, label_ids, step_size, sample, mean, sigma,
                 alpha, t, prev_sample):
    """Refine ``sample`` with gradient steps on the controller loss.

    Each step minimises ``model_control(...).loss`` plus ``0.01`` times a
    squared distance between ``sample`` and ``mean`` (divided by ``sigma``
    unless ``sigma.mean()`` is zero).

    Args:
        debug_lst: Not read by this function; kept for signature compatibility.
        model_control: Callable accepting ``input_embs``, ``labels`` and ``t``
            keywords and returning an object exposing ``loss``.
        model3: Callable mapping label ids to embeddings; applied to
            ``label_ids[:, sample.size(1):]``.
        label_ids: Integer label tensor; moved to ``sample``'s device.
        step_size: Learning rate handed to Adagrad.
        sample: Tensor to refine. It is copied, so the caller's tensor is left
            untouched.
        mean: Tensor the penalty pulls ``sample`` towards.
        sigma: Tensor dividing the penalty.
        alpha: Not read by this function.
        t: Indexable whose first element provides the step via ``.item()``.
        prev_sample: Not read by this function.

    Returns:
        The refined tensor (``requires_grad`` is False). When ``t[0] < 10`` no
        update is performed and a copy of ``sample`` is returned.
    """
    step = t[0].item()
    num_updates = 0 if step < 10 else 3
    # The controller is queried one step earlier; step 0 maps to 200.
    tt = step - 1 if step > 0 else 200

    # Follow the sample's device rather than assuming CUDA is available.
    label_ids = label_ids.to(sample.device)
    tgt_embs = model3(label_ids[:, sample.size(1):])
    label_ids2 = label_ids.clone()
    label_ids2[:, :65] = -100  # -100 labels for the first 65 positions

    # Copy first: Adagrad updates its parameter in place, and wrapping
    # ``sample`` directly would let the first step overwrite the caller's data.
    input_embs_param = th.nn.Parameter(sample.detach().clone())

    coef = 0.01
    penalty_div = 1. if sigma.mean() == 0 else sigma

    with th.enable_grad():
        for _ in range(num_updates):
            # The parameter is re-wrapped at the end of every iteration, so a
            # new optimizer (with an empty accumulator) is built each time.
            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
            optimizer.zero_grad()

            input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
            model_out = model_control(input_embs=input_embs, labels=label_ids2, t=tt)
            logp_term = coef * ((mean - input_embs_param) ** 2 / penalty_div).mean(dim=0).sum()

            loss = model_out.loss + logp_term
            loss.backward()
            optimizer.step()

            # The noise term is scaled by 0.0; the draw is kept so the random
            # number stream matches the previous behaviour.
            epsilon = th.randn_like(input_embs_param.data)
            input_embs_param = th.nn.Parameter(
                (input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())

    return input_embs_param.data
def langevin_fn4(debug_lst, model_control, model3, label_ids, step_size, sample, mean, sigma,
                 alpha, t, prev_sample):
    """Refine ``sample`` against a controller that receives ``pos_ids``.

    Each step minimises ``model_control(...).loss`` plus ``0.0001`` times a
    squared distance between ``sample`` and ``mean`` (divided by ``sigma``
    unless ``sigma.mean()`` is zero). Losses are printed as the steps run.

    Args:
        debug_lst: Not read by this function; kept for signature compatibility.
        model_control: Callable accepting ``input_embs``, ``pos_ids`` and ``t``
            keywords and returning an object exposing ``loss``.
        model3: Not read by this function.
        label_ids: Tensor forwarded as ``pos_ids``; moved to ``sample``'s device.
        step_size: Learning rate handed to Adagrad.
        sample: Tensor to refine. It is copied, so the caller's tensor is left
            untouched.
        mean: Tensor the penalty pulls ``sample`` towards.
        sigma: Tensor dividing the penalty.
        alpha: Not read by this function.
        t: Indexable whose first element provides the step via ``.item()``.
        prev_sample: Not read by this function.

    Returns:
        The refined tensor (``requires_grad`` is False). When ``t[0] < 10`` no
        update is performed and a copy of ``sample`` is returned.
    """
    step = t[0].item()
    num_updates = 0 if step < 10 else 3
    # The controller is queried one step earlier; step 0 maps to 200.
    tt = step - 1 if step > 0 else 200

    # Follow the sample's device rather than assuming CUDA is available.
    label_ids = label_ids.to(sample.device)

    # Copy first: Adagrad updates its parameter in place, and wrapping
    # ``sample`` directly would let the first step overwrite the caller's data.
    input_embs_param = th.nn.Parameter(sample.detach().clone())

    coef = 0.0001  # prev default.
    penalty_div = 1. if sigma.mean() == 0 else sigma

    with th.enable_grad():
        for i in range(num_updates):
            # The parameter is re-wrapped at the end of every iteration, so a
            # new optimizer (with an empty accumulator) is built each time.
            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
            optimizer.zero_grad()

            model_out = model_control(input_embs=input_embs_param, pos_ids=label_ids, t=tt)
            logp_term = coef * ((mean - input_embs_param) ** 2 / penalty_div).mean(dim=0).sum()
            print(model_out.loss, f'start_{i}', logp_term.item(),
                  t[0].item(), sigma.mean().item())

            loss = model_out.loss + logp_term
            loss.backward()
            optimizer.step()

            # The noise term is scaled by 0.0; the draw is kept so the random
            # number stream matches the previous behaviour.
            epsilon = th.randn_like(input_embs_param.data)
            input_embs_param = th.nn.Parameter(
                (input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())

    # NOTE(review): the source's indentation was lost; this closing evaluation
    # is placed after the refinement loop (it runs once, even when no update
    # was made) -- confirm against the upstream file.
    model_out = model_control(input_embs=input_embs_param, pos_ids=label_ids, t=tt)
    print(model_out.loss, 'end')

    return input_embs_param.data
def langevin_fn_length(coeff, diffusion, partial_mask, diff_model, tgt_embs, step_size, sample, mean, sigma,
                       alpha, t, prev_sample):
    """Refine ``sample`` so the predicted ``x_start`` matches ``tgt_embs``.

    Each step calls ``diffusion.p_mean_variance`` on the current sample and
    minimises the squared error between its ``'pred_xstart'`` output and
    ``tgt_embs`` at the positions where ``partial_mask`` is False, plus
    ``coeff`` times a squared distance between ``sample`` and ``mean``.

    Args:
        coeff: Weight of the penalty towards ``mean``.
        diffusion: Object exposing ``p_mean_variance(model, x, t, clip_denoised=,
            denoised_fn=, model_kwargs=)`` that returns a mapping containing
            ``'pred_xstart'``.
        partial_mask: Boolean tensor; its negation selects the entries that
            enter the infill loss.
        diff_model: Passed to ``p_mean_variance`` as the model.
        tgt_embs: Target tensor compared against ``'pred_xstart'``.
        step_size: Learning rate handed to Adagrad.
        sample: Tensor to refine. It is copied, so the caller's tensor is left
            untouched.
        mean: Tensor the penalty pulls ``sample`` towards.
        sigma: Tensor dividing the penalty; its mean divides the infill loss.
        alpha: Not read by this function.
        t: Forwarded to ``p_mean_variance``; its first element provides the
            step via ``.item()``.
        prev_sample: Not read by this function.

    Returns:
        The refined tensor (``requires_grad`` is False). When ``t[0] < 10`` no
        update is performed and a copy of ``sample`` is returned.
    """
    num_updates = 0 if t[0].item() < 10 else 3

    # Copy first: Adagrad updates its parameter in place, and wrapping
    # ``sample`` directly would let the first step overwrite the caller's data.
    input_embs_param = th.nn.Parameter(sample.detach().clone())
    coef = coeff

    with th.enable_grad():
        for i in range(num_updates):
            # The parameter is re-wrapped at the end of every iteration, so a
            # new optimizer (with an empty accumulator) is built each time.
            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
            optimizer.zero_grad()
            print(t.shape)

            out = diffusion.p_mean_variance(
                diff_model,
                input_embs_param,
                t,
                clip_denoised=False,
                denoised_fn=None,
                model_kwargs={},
            )
            sq_err = (out['pred_xstart'][~partial_mask] - tgt_embs[~partial_mask]) ** 2

            # NOTE(review): the two branches reduce the infill error
            # differently (flat mean over selected entries vs. a reshape to
            # tgt_embs.size(0) rows followed by a mean over that axis). Kept
            # as found -- confirm whether that is intended.
            if sigma.mean() == 0:
                logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
                infill_loss = sq_err.mean(dim=0).sum()
            else:
                logp_term = coef * ((mean - input_embs_param) ** 2 / sigma).mean(dim=0).sum()
                # The view needs the same number of selected positions in
                # every row of ``partial_mask``.
                infill_loss = sq_err.view(tgt_embs.size(0), -1, tgt_embs.size(-1))
                infill_loss = (infill_loss / sigma.mean()).mean(dim=0).sum()

            print(infill_loss, f'start_{i}', logp_term.item(),
                  t[0].item(), sigma.mean().item())

            loss = logp_term + infill_loss
            loss.backward()
            optimizer.step()

            # The noise term is scaled by 0.0; the draw is kept so the random
            # number stream matches the previous behaviour.
            epsilon = th.randn_like(input_embs_param.data)
            input_embs_param = th.nn.Parameter(
                (input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())

    return input_embs_param.data
def langevin_fn_tree(coeff, model_control, model3, label_ids, step_size, sample, mean, sigma,
                     alpha, t, prev_sample):
    """Refine ``sample`` against a controller that receives ``parse_chart``.

    Each step minimises ``model_control(...).loss`` plus ``coeff`` times a
    squared distance between ``sample`` and ``mean`` (divided by ``sigma``
    unless ``sigma.mean()`` is zero).

    Args:
        coeff: Weight of the penalty towards ``mean``. Tuning notes left in the
            source mention 0.1 as "good for partial", 0.0005 as "good for
            full" and 0.001 as "also good for full (more fluent)".
        model_control: Callable accepting ``input_embs``, ``parse_chart`` and
            ``t`` keywords and returning an object exposing ``loss``.
        model3: Not read by this function.
        label_ids: Tensor forwarded as ``parse_chart``; moved to ``sample``'s
            device.
        step_size: Learning rate handed to Adagrad.
        sample: Tensor to refine. It is copied, so the caller's tensor is left
            untouched.
        mean: Tensor the penalty pulls ``sample`` towards.
        sigma: Tensor dividing the penalty.
        alpha: Not read by this function.
        t: Indexable whose first element provides the step via ``.item()``.
        prev_sample: Not read by this function.

    Returns:
        The refined tensor (``requires_grad`` is False). When ``t[0] < 10`` no
        update is performed and a copy of ``sample`` is returned.
    """
    step = t[0].item()
    num_updates = 0 if step < 10 else 3
    # The controller is queried one step earlier; step 0 maps to 200.
    tt = step - 1 if step > 0 else 200

    # Follow the sample's device rather than assuming CUDA is available.
    label_ids = label_ids.to(sample.device)

    # Copy first: Adagrad updates its parameter in place, and wrapping
    # ``sample`` directly would let the first step overwrite the caller's data.
    input_embs_param = th.nn.Parameter(sample.detach().clone())

    coef = coeff
    penalty_div = 1. if sigma.mean() == 0 else sigma

    with th.enable_grad():
        for _ in range(num_updates):
            # The parameter is re-wrapped at the end of every iteration, so a
            # new optimizer (with an empty accumulator) is built each time.
            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
            optimizer.zero_grad()

            model_out = model_control(input_embs=input_embs_param, parse_chart=label_ids, t=tt)
            logp_term = coef * ((mean - input_embs_param) ** 2 / penalty_div).mean(dim=0).sum()

            loss = model_out.loss + logp_term
            loss.backward()
            optimizer.step()

            # The noise term is scaled by 0.0; the draw is kept so the random
            # number stream matches the previous behaviour.
            epsilon = th.randn_like(input_embs_param.data)
            input_embs_param = th.nn.Parameter(
                (input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())

    return input_embs_param.data
def langevin_fn1(debug_lst, model_control, model3, label_ids, step_size, sample, mean, sigma,
                 alpha, t, prev_sample):
    """Take one plain gradient step on the controller loss and log scores.

    Before any update, the per-sequence score from ``get_score`` is appended
    to ``debug_lst``. The update itself is
    ``sample - 3 * sigma.mean() * grad(loss)``; no penalty towards ``mean`` is
    applied.

    Args:
        debug_lst: List that receives one ``get_score`` result per call.
        model_control: Callable accepting ``input_embs``, ``labels`` and ``t``
            keywords and returning an object exposing ``loss``.
        model3: Callable mapping label ids to embeddings; applied to
            ``label_ids[:, sample.size(1):]``.
        label_ids: Integer label tensor; moved to ``sample``'s device.
        step_size: Not used by the update (the step is scaled by
            ``3 * sigma.mean()`` instead); kept for signature compatibility.
        sample: Tensor to refine.
        mean: Not read by this function.
        sigma: Tensor whose mean scales the step.
        alpha: Not read by this function.
        t: Indexable whose first element provides the step via ``.item()``.
        prev_sample: Not read by this function.

    Returns:
        The refined tensor (``.data`` of the internal parameter). When
        ``t[0] < 10`` no update is performed.
    """
    step = t[0].item()
    num_updates = 0 if step < 10 else 1
    # The controller is queried one step earlier; step 0 maps to 200.
    tt = step - 1 if step > 0 else 200

    # Follow the sample's device rather than assuming CUDA is available.
    label_ids = label_ids.to(sample.device)
    tgt_embs = model3(label_ids[:, sample.size(1):])
    label_ids2 = label_ids.clone()
    label_ids2[:, :65] = -100  # -100 labels for the first 65 positions

    # The update below rebinds ``.data`` to a new tensor instead of writing in
    # place, so wrapping ``sample`` directly does not modify the caller's data.
    input_embs_param = th.nn.Parameter(sample)

    input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
    debug_lst.append(get_score(input_embs, label_ids2, model_control, t=tt))

    coef = 3.
    with th.enable_grad():
        for i in range(num_updates):
            # Reset the gradient directly; the optimizer that used to be built
            # here was never stepped and served only to clear gradients.
            input_embs_param.grad = None

            input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
            model_out = model_control(input_embs=input_embs, labels=label_ids2, t=tt)
            print(model_out.loss, f'start_{i}', t[0].item(), sigma.mean().item())

            loss = model_out.loss
            loss.backward()
            input_embs_param.data = (
                input_embs_param.data - coef * sigma.mean().item() * input_embs_param.grad)

    # NOTE(review): the source's indentation was lost; this closing evaluation
    # is placed after the refinement loop (it runs once, even when no update
    # was made) -- confirm against the upstream file.
    input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
    model_out = model_control(input_embs=input_embs, labels=label_ids2, t=tt)
    print(model_out.loss, 'end')

    return input_embs_param.data
def langevin_fn3_compose(debug_lst, model_control, model3, label_ids_lst, step_size, sample, mean, sigma,
                         alpha, t, prev_sample):
    """Refine ``sample`` against several label sets at once.

    The controller losses of all label tensors in ``label_ids_lst`` are summed
    and minimised together with ``0.01`` times a squared distance between
    ``sample`` and ``mean`` (divided by ``sigma`` unless ``sigma.mean()`` is
    zero). Before any update, one ``get_score`` result per label tensor is
    collected and the resulting list is appended to ``debug_lst``.

    Args:
        debug_lst: List that receives one list of scores per call.
        model_control: Callable accepting ``input_embs``, ``labels`` and ``t``
            keywords and returning an object exposing ``loss``.
        model3: Callable mapping label ids to embeddings; applied to
            ``label_ids[:, sample.size(1):]`` for every label tensor.
        label_ids_lst: Iterable of integer label tensors; each is moved to
            ``sample``'s device.
        step_size: Learning rate handed to Adagrad.
        sample: Tensor to refine. It is copied, so the caller's tensor is left
            untouched.
        mean: Tensor the penalty pulls ``sample`` towards.
        sigma: Tensor dividing the penalty.
        alpha: Not read by this function.
        t: Indexable whose first element provides the step via ``.item()``.
        prev_sample: Not read by this function.

    Returns:
        The refined tensor (``requires_grad`` is False). When ``t[0] < 10`` no
        update is performed and a copy of ``sample`` is returned.
    """
    step = t[0].item()
    num_updates = 0 if step < 10 else 3
    # The controller is queried one step earlier; step 0 maps to 200.
    tt = step - 1 if step > 0 else 200

    # Build (target embeddings, masked labels) once per label tensor.
    targets = []
    for label_ids in label_ids_lst:
        label_ids = label_ids.to(sample.device)
        tgt_embs = model3(label_ids[:, sample.size(1):])
        label_ids2 = label_ids.clone()
        label_ids2[:, :65] = -100  # -100 labels for the first 65 positions
        targets.append((tgt_embs, label_ids2))

    # Copy first: Adagrad updates its parameter in place, and wrapping
    # ``sample`` directly would let the first step overwrite the caller's data.
    input_embs_param = th.nn.Parameter(sample.detach().clone())

    debug_lst.append([
        get_score(th.cat([input_embs_param, tgt_embs], dim=1), label_ids2, model_control, t=tt)
        for tgt_embs, label_ids2 in targets
    ])

    coef = 0.01
    penalty_div = 1. if sigma.mean() == 0 else sigma

    with th.enable_grad():
        for i in range(num_updates):
            # The parameter is re-wrapped at the end of every iteration, so a
            # new optimizer (with an empty accumulator) is built each time.
            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
            optimizer.zero_grad()

            cum_loss = 0
            for tgt_embs, label_ids2 in targets:
                input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
                model_out = model_control(input_embs=input_embs, labels=label_ids2, t=tt)
                cum_loss += model_out.loss

            logp_term = coef * ((mean - input_embs_param) ** 2 / penalty_div).mean(dim=0).sum()
            print(cum_loss, f'start_{i}', logp_term.item(), t[0].item(), sigma.mean().item())

            loss = cum_loss + logp_term
            loss.backward()
            optimizer.step()

            # The noise term is scaled by 0.0; the draw is kept so the random
            # number stream matches the previous behaviour.
            epsilon = th.randn_like(input_embs_param.data)
            input_embs_param = th.nn.Parameter(
                (input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())

    # The scores that used to be recomputed here were never stored or
    # returned, so that extra round of controller forward passes is dropped.
    return input_embs_param.data