image_size: 224
path_svg: ~ # optional: load an SVG file and train from it
mask_object: False # if the target image contains a background, it is better to mask it out
fix_scale: False # if the target image is not square, it is recommended to fix the scale
# train
num_iter: 500
batch_size: 1
num_stages: 1 # number of training stages: train x strokes, freeze them, then train another x strokes, and so on
lr_scheduler: False
lr_decay_rate: 0.1
decay_steps: [ 1000, 1500 ]
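# note (assumed step-decay semantics, not spelled out here): with lr_scheduler
# enabled, the point lr is multiplied by lr_decay_rate at each iteration listed
# in decay_steps, e.g. lr 1.0 -> 0.1 at iter 1000 -> 0.01 at iter 1500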
lr: 1 # learning rate for the stroke control points
color_lr: 0.01
color_vars_threshold: 0.1
width_lr: 0.1 # learning rate for the stroke width
max_width: 50 # maximum stroke width
# stroke attrs
num_paths: 128 # number of strokes
width: 3 # initial stroke width
control_points_per_seg: 4
num_segments: 1
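# note (assumed diffvg-style parameterization): each stroke consists of
# num_segments Bezier segments with control_points_per_seg control points each,
# so the defaults above yield one cubic Bezier curve per stroke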
optim_opacity: True # if True, the stroke opacity is optimized
optim_width: True # if True, the stroke width is optimized
optim_rgba: False # if True, the stroke RGBA color is optimized
opacity_delta: 0 # opacity threshold for stroke pruning
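# note (assumed pruning semantics): strokes whose optimized opacity falls below
# opacity_delta are pruned; 0 keeps every stroke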
# init strokes
attention_init: True # if True, use the attention heads of the DINO model to set the locations of the initial strokes
xdog_intersec: True # initialize along edges by intersecting the XDoG edge map with the attention map
softmax_temp: 0.5 # softmax temperature applied to the attention map
cross_attn_res: 16 # cross-attn resolution
self_attn_res: 32 # self-attn resolution
max_com: 20 # number of self-attn components (maps) to select
mean_comp: False # if True, use the average of the self-attn maps
comp_idx: 0 # index of the self-attn map to use when mean_comp is False
attn_coeff: 1.0 # attn fusion weight w: w * cross-attn + (1 - w) * self-attn
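# e.g. attn_coeff 1.0 uses the cross-attn map only, 0.0 the self-attn map only,
# and 0.5 blends the two maps equally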
log_cross_attn: False
u2net_path: "./checkpoint/u2net/u2net.pth"
# ldm
model_id: "sd15" # Stable Diffusion v1.5
ldm_speed_up: False
enable_xformers: True # use xFormers to speed up attention computation
gradient_checkpoint: False # slows down training but saves GPU VRAM
token_ind: 1 # index of the CLIP prompt token embedding; counting starts at 1 because index 0 is the start token
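# e.g. (illustrative) for the prompt "a horse", token_ind 1 selects "a" and
# token_ind 2 selects "horse", since index 0 is the start-of-text token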
use_ddim: True
num_inference_steps: 100
guidance_scale: 7.5
# ASDS loss
sds:
  crop_size: 512
  augmentations: "affine"
  guidance_scale: 100
  grad_scale: 1e-5
  t_range: [ 0.05, 0.95 ]
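  # note (assumed SDS convention): t_range gives the fraction of the diffusion
  # schedule from which the timestep t is sampled, here 5%-95%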
  warmup: 2000
# JVSP
clip:
  model_name: "RN101" # RN101, ViT-L/14
  feats_loss_type: "l2" # CLIP visual loss type on the conv-layer features
  feats_loss_weights: [ 0, 0, 1.0, 1.0, 0 ] # for RN-based models
  # feats_loss_weights: [ 0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0 ] # for ViT-based models
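  # note: one weight per feature level (5 for the ResNet backbone, 12 for the
  # ViT backbone); a weight of 0 excludes that layer from the visual loss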
  fc_loss_weight: 0.1 # weight of the CLIP visual loss on the fc-layer features
  augmentations: "affine_norm" # augmentations applied before computing the CLIP visual loss; alternative: "affine_norm_trivial"
  num_aug: 4 # number of augmentations applied before computing the CLIP visual loss
  vis_loss: 1 # 1 to enable, 0 to disable the CLIP visual loss
  text_visual_coeff: 0 # weight of the cosine similarity between the text and image embeddings
perceptual:
  name: "lpips" # alternative: "dists"
  lpips_net: 'vgg'
  coeff: 0.2