---
# Byte Dream configuration.
#
# NOTE(review): this file was recovered from a copy in which every line was
# prefixed with "| " and all indentation had been stripped. The nesting below
# is reconstructed from how the keys are grouped. In particular, placing
# `unet`, `scheduler`, `vae` and `text_encoder` under `model` is an
# assumption; confirm it against the code that loads this file.
#
# NOTE(review): three precision-related settings appear in this file with
# different values (cpu_optimization.precision "fp32",
# memory_optimization.mixed_precision "fp16", training.mixed_precision "no").
# Which of them takes effect is not visible from here; confirm with the
# consumer before relying on any one of them.

# Model identity and per-component settings.
model:
  name: "Byte Dream"
  version: "1.0.0"

  # UNet settings.
  unet:
    in_channels: 4
    out_channels: 4
    block_out_channels: [128, 256, 512, 512]
    layers_per_block: 1
    attention_head_dim: 4
    cross_attention_dim: 512
    use_linear_projection: false

  # Scheduler settings.
  scheduler:
    name: "DDIM"
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false
    set_alpha_to_one: false

  # VAE settings.
  vae:
    in_channels: 3
    out_channels: 3
    # One entry per block; written as block sequences to keep lines short.
    down_block_types:
      - "DownEncoderBlock2D"
      - "DownEncoderBlock2D"
      - "DownEncoderBlock2D"
      - "DownEncoderBlock2D"
    up_block_types:
      - "UpDecoderBlock2D"
      - "UpDecoderBlock2D"
      - "UpDecoderBlock2D"
      - "UpDecoderBlock2D"
    latent_channels: 4
    sample_size: 512

    # Four entries, the same count as the block type lists above.
    block_out_channels: [64, 128, 256, 256]

  # Text encoder settings.
  text_encoder:
    model: "openai/clip-vit-base-patch32"
    max_length: 77

# Generation settings.
generation:
  width: 512
  height: 512
  num_inference_steps: 50
  guidance_scale: 7.5
  negative_prompt: "ugly, blurry, low quality, distorted, deformed"
  # Explicit null; presumably means "no fixed seed" -- TODO confirm.
  seed: null

# CPU optimization settings.
cpu_optimization:
  use_openvino: false
  use_onnx: false
  precision: "fp32"
  # Presumably -1 lets the runtime choose the thread count -- TODO confirm.
  threads: -1
  # Explicit null; presumably means "no limit" -- TODO confirm.
  memory_limit: null

# Memory optimization settings.
memory_optimization:
  use_gradient_checkpointing: true
  mixed_precision: "fp16"
  attention_slicing: true

# Training settings.
training:
  dataset_path: "./dataset"
  output_dir: "./models/bytedream"
  epochs: 100
  batch_size: 1
  gradient_accumulation_steps: 4
  learning_rate: 0.00001
  lr_scheduler: "constant_with_warmup"
  lr_warmup_steps: 500
  max_grad_norm: 1.0
  mixed_precision: "no"

  # Flip / crop options.
  random_flip: true
  random_crop: false
  center_crop: true

  # Logging.
  logging_dir: "./logs"
  log_every_n_steps: 10

# Hugging Face settings.
huggingface:
  # Empty string, not null; kept exactly as in the original.
  organization: ""
  private: false
  push_to_hub: true