---
# Configuration for the `cc12m_256x256` run.
#
# NOTE(review): this file was reconstructed from a copy in which every line
# was wrapped in table pipes (`| ... |`) and all indentation had been lost.
# The nesting under `unet_config`, `inner_config` and `reader_config` was
# inferred from key order -- confirm against the consumer's schema.

name: cc12m_256x256
dataset_config: configs/datasets/cc12m.yaml

min_examples: 10000
# NOTE(review): the only kebab-case key in this file. Left unchanged because
# the consuming tool may expect this exact spelling -- confirm.
sample-dir: /mnt/data/samples

sample_image_size: 256
test_file_list: validation.tsv

output_dir: /mnt/data/outputs
num_diffusion_steps: 1000
reproject_signal: false
model_output_scale: 0
prediction_type: V_PREDICTION
loss_target_type: DDPM
schedule_type: DEEPFLOYD
prediction_length: 129
use_vdm_loss_weights: false
use_double_loss: true
no_use_residual: true
num_training_steps: 1000000
avg_lm_steps: 0
categorical_conditioning: 0
rescale_signal: 1
schedule_shifted: true
skip_normalization: true
random_low_noise: true
vocab_file: t5.vocab
text_model: google/flan-t5-xl
model: nested_unet
vision_model: nested_unet

# Outer network settings. `inner_config` is a nested mapping carrying the
# same set of keys for the inner network.
unet_config:
  attention_levels: []
  conditioning_feature_dim: -1
  conditioning_feature_proj_dim: -1
  freeze_inner_unet: false
  # NOTE(review): plain `None` is the *string* "None" in YAML, not a null.
  # Kept as-is; switch to `null` only if the consumer expects a real null.
  initialize_inner_with_pretrained: None
  inner_config:
    attention_levels: [1, 2]
    conditioning_feature_dim: -1
    conditioning_feature_proj_dim: 2048
    masked_cross_attention: 0
    # Quoted so the embedded colon can never be read as a mapping indicator.
    micro_conditioning: 'scale:64'
    nesting: true
    num_attention_layers: [0, 1, 5]
    num_lm_head_layers: 0
    num_resnets_per_resolution: [2, 2, 2]
    num_temporal_attention_layers: null
    resnet_config:
      dropout: 0.0
      num_channels: -1
      num_groups_norm: 32
      output_channels: -1
      use_attention_ffn: true
    resolution_channels: [256, 512, 768]
    skip_cond_emb: false
    skip_mid_blocks: false
    temporal_dim: null
    temporal_mode: false
    temporal_positional_encoding: false
    temporal_spatial_ds: false
  interp_conditioning: false
  masked_cross_attention: 1
  micro_conditioning: 'scale:256'
  nesting: false
  num_attention_layers: [0, 0, 0]
  num_lm_head_layers: 0
  num_resnets_per_resolution: [2, 2, 1]
  num_temporal_attention_layers: null
  resnet_config:
    dropout: 0.0
    num_channels: -1
    num_groups_norm: 32
    output_channels: -1
    use_attention_ffn: false
  resolution_channels: [64, 128, 256]
  skip_cond_emb: true
  skip_inner_unet_input: false
  skip_mid_blocks: true
  skip_normalization: true
  temporal_dim: 1024
  temporal_mode: false
  temporal_positional_encoding: false
  temporal_spatial_ds: false

# Data-reader settings.
# NOTE(review): the source listed `random_crop: false` twice in this mapping.
# Duplicate keys are invalid YAML (most parsers silently keep the last one),
# so only one entry is kept; both carried the same value.
reader_config:
  image_size: 256
  smaller_side_size: 256
  random_crop: false
  max_caption_length: -1
  max_token_length: 128
  reader_buffer_size: 2000
  shuffle_buffer_size: 2000
  append_eos: true
  num_readers: 2
  pad_to_max_length: false
  padding_token: <pad>
  prepad_bos: false
  prepad_caption_with_space: true
  use_tokenizer_scores: true

use_lm_mask: 1

metrics: fid,clip

use_precomputed_text_embeddings: 0
pretrained_vision_file: vis_model_256x256.pth

# Quoted: an unquoted digits-and-colon value is read as a base-60 integer by
# YAML 1.1 parsers.
mixed_ratio: '2:1'
gradient_clip_norm: 2
loss_factor: 1
num_gradient_accumulations: 1
warmup_steps: 10000

log_freq: 50
save_freq: 5000
lr: 5.0e-05
fp16: 0