# Condition types enabled for this config (order preserved as written).
conditions:
  - lineart
  - edge
  - depth
  - normal
  - albedo
  - segmentation
  - openpose
# Model config.
model:
  # NOTE(review): presumably an architecture name resolved by the model-building
  # code — confirm against the consumer before renaming.
  model: Jodi_1600M_P1_D20
  image_size: 1024
  mixed_precision: bf16
  fp32_attention: true
  # No checkpoint paths are set. Written as explicit null so the empty values
  # are not mistaken for truncated lines.
  load_from: null
  resume_from: null
  pe_interpolation: 1.0
  attn_type: linear
  # Three entries; the third is deliberately spelled null rather than left as a
  # bare dash, so the list length and the empty slot are both obvious.
  mlp_acts:
    - silu
    - silu
    - null
  mlp_ratio: 2.5
  use_pe: true
  qk_norm: false
  class_dropout_prob: 0.1
# VAE settings.
vae:
  vae_type: dc-ae
  # The "f32c32" in the checkpoint name lines up with vae_downsample_rate: 32
  # and vae_latent_dim: 32 below — presumably these must change together if the
  # checkpoint is swapped; confirm.
  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
  # NOTE(review): presumably the latent scaling constant tied to this specific
  # checkpoint — confirm before changing.
  scale_factor: 0.41407
  vae_latent_dim: 32
  vae_downsample_rate: 32
  sample_posterior: true
# Text encoder settings.
text_encoder:
  text_encoder_name: gemma-2-2b-it
  y_norm: true
  y_norm_scale_factor: 0.01
  # NOTE(review): presumably the maximum prompt length in tokens — confirm.
  model_max_length: 300
  # CHI prompt: instruction text stored as a list of single-quoted strings, one
  # per line of the instruction. The final entry ends with a trailing space
  # ('User Prompt: ') — keep it; presumably the user's prompt is appended
  # directly after it (confirm in the code that consumes this list).
  chi_prompt:
    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
    - 'Here are examples of how to transform or refine prompts:'
    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
    - 'User Prompt: '
# Flow schedule settings (Sana-style).
scheduler:
  predict_v: true
  noise_schedule: linear_flow
  flow_shift: 3.0
  # Logit-normal timestep weighting; logit_mean / logit_std below presumably
  # parameterize that distribution — confirm in the training code.
  weighting_scheme: logit_normal
  logit_mean: 0.0
  logit_std: 1.0