image_size: 224
path_svg: ~ # optional: load an SVG file and train from it
mask_object: False # if the target image contains a background, it is better to mask it out
fix_scale: False # if the target image is not square, it is recommended to fix the scale
# train
num_iter: 500
batch_size: 1
num_stages: 1 # number of training stages: train x strokes, freeze them, then train another x strokes, and so on
lr_scheduler: False
lr_decay_rate: 0.1
decay_steps: [ 1000, 1500 ]
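# note (assumed step-decay semantics, not spelled out here): with lr_scheduler
# enabled, the point lr is multiplied by lr_decay_rate at each iteration listed
# in decay_steps, e.g. lr 1.0 -> 0.1 at iter 1000 -> 0.01 at iter 1500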
lr: 1 # learning rate for the stroke control points
color_lr: 0.01
color_vars_threshold: 0.1
width_lr: 0.1 # learning rate for the stroke width
max_width: 50 # maximum stroke width
# stroke attrs
num_paths: 128 # number of strokes
width: 3 # initial stroke width
control_points_per_seg: 4
num_segments: 1
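# note (assumed diffvg-style parameterization): each stroke consists of
# num_segments Bezier segments with control_points_per_seg control points each,
# so the defaults above yield one cubic Bezier curve per stroke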
optim_opacity: True # if True, the stroke opacity is optimized
optim_width: True # if True, the stroke width is optimized
optim_rgba: False # if True, the stroke RGBA color is optimized
opacity_delta: 0 # opacity threshold for stroke pruning
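# note (assumed pruning semantics): strokes whose optimized opacity falls below
# opacity_delta are pruned; 0 keeps every stroke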
# init strokes
attention_init: True # if True, use the attention heads of the DINO model to set the locations of the initial strokes
xdog_intersec: True # initialize along edges by intersecting the XDoG edge map with the attention map
softmax_temp: 0.5 # softmax temperature applied to the attention map
cross_attn_res: 16 # cross-attn resolution
self_attn_res: 32 # self-attn resolution
max_com: 20 # number of self-attn components (maps) to select
mean_comp: False # if True, use the average of the self-attn maps
comp_idx: 0 # index of the self-attn map to use when mean_comp is False
attn_coeff: 1.0 # attn fusion weight w: w * cross-attn + (1 - w) * self-attn
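# e.g. attn_coeff 1.0 uses the cross-attn map only, 0.0 the self-attn map only,
# and 0.5 blends the two maps equally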
log_cross_attn: False
u2net_path: "./checkpoint/u2net/u2net.pth"
# ldm
model_id: "sd15" # Stable Diffusion v1.5
ldm_speed_up: False
enable_xformers: True # use xFormers to speed up attention computation
gradient_checkpoint: False # slows down training but saves GPU VRAM
token_ind: 1 # index of the CLIP prompt token embedding; counting starts at 1 because index 0 is the start token
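# e.g. (illustrative) for the prompt "a horse", token_ind 1 selects "a" and
# token_ind 2 selects "horse", since index 0 is the start-of-text token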
use_ddim: True
num_inference_steps: 100
guidance_scale: 7.5
# ASDS loss
sds:
  crop_size: 512
  augmentations: "affine"
  guidance_scale: 100
  grad_scale: 1e-5
  t_range: [ 0.05, 0.95 ]
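  # note (assumed SDS convention): t_range gives the fraction of the diffusion
  # schedule from which the timestep t is sampled, here 5%-95%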
  warmup: 2000
# JVSP
clip:
  model_name: "RN101" # RN101, ViT-L/14
  feats_loss_type: "l2" # CLIP visual loss type on the conv-layer features
  feats_loss_weights: [ 0, 0, 1.0, 1.0, 0 ] # for RN-based models
  # feats_loss_weights: [ 0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0 ] # for ViT-based models
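  # note: one weight per feature level (5 for the ResNet backbone, 12 for the
  # ViT backbone); a weight of 0 excludes that layer from the visual loss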
  fc_loss_weight: 0.1 # weight of the CLIP visual loss on the fc-layer features
  augmentations: "affine_norm" # augmentations applied before computing the CLIP visual loss; alternative: "affine_norm_trivial"
  num_aug: 4 # number of augmentations applied before computing the CLIP visual loss
  vis_loss: 1 # 1 to enable, 0 to disable the CLIP visual loss
  text_visual_coeff: 0 # weight of the cosine similarity between the text and image embeddings
perceptual:
  name: "lpips" # alternative: "dists"
  lpips_net: 'vgg'
  coeff: 0.2