# Uploaded by vijayakumaran92 in commit dfaef4b (verified): "Add files using upload-large-folder tool"
# Environment settings: communication backend and random seed.
ENV:
  BACKEND: nccl
  SEED: 1999
# Solver configuration: training schedule, precision/FSDP options, model
# definition, tuner, sampling arguments, optimizer, data sets and hooks.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped; verify it against the consuming framework's
# schema (in particular that TUNER belongs directly under SOLVER).
SOLVER:
  # NAME DESCRIPTION: TYPE: default: 'LatentUfitSolver'
  NAME: FormalACEPlusSolver
  # MAX_STEPS DESCRIPTION: The total steps for training. TYPE: int default: 100000
  MAX_STEPS: 100000
  # USE_AMP DESCRIPTION: Use amp to support mixed precision or not, default is False. TYPE: bool default: False
  USE_AMP: True
  # DTYPE DESCRIPTION: The precision for training. TYPE: str default: 'float32'
  DTYPE: bfloat16
  ENABLE_GRADSCALER: False
  # USE_FAIRSCALE DESCRIPTION: Use fairscale as the backend of ddp, default False. TYPE: bool default: False
  USE_FAIRSCALE: False
  USE_ORIG_PARAMS: True
  # LoRA uses DDP (USE_FSDP=False); otherwise use FSDP (USE_FSDP=True).
  # NOTE(review): a SwiftLoRA TUNER is configured below while USE_FSDP is True;
  # confirm which of the two is intended.
  USE_FSDP: True
  # LOAD_MODEL_ONLY DESCRIPTION: Only load the model rather than the optimizer and schedule, default is False. TYPE: bool default: False
  LOAD_MODEL_ONLY: False
  # RESUME_FROM DESCRIPTION: Resume from some state of training! TYPE: str default: ''
  # The empty value parses as null.
  RESUME_FROM:
  # WORK_DIR DESCRIPTION: Save dir of the training log or model. TYPE: str default: ''
  WORK_DIR: ./examples/exp_example/
  # LOG_FILE DESCRIPTION: Save log path. TYPE: str default: ''
  LOG_FILE: std_log.txt
  # LOG_TRAIN_NUM DESCRIPTION: The number of samples used to log in training phase. TYPE: int default: -1
  LOG_TRAIN_NUM: 16
  # FSDP_REDUCE_DTYPE DESCRIPTION: The dtype of reduce in FSDP. TYPE: str default: 'float16'
  FSDP_REDUCE_DTYPE: float32
  # FSDP_BUFFER_DTYPE DESCRIPTION: The dtype of buffer in FSDP. TYPE: str default: 'float16'
  FSDP_BUFFER_DTYPE: float32
  # FSDP_SHARD_MODULES DESCRIPTION: The modules to be sharded in FSDP. TYPE: list default: ['model']
  FSDP_SHARD_MODULES:
    - MODULE: 'model.model'
      FSDP_GROUP: ['single_blocks', 'double_blocks']
    - MODULE: 'cond_stage_model.t5_model.hf_module.encoder'
      FSDP_GROUP: ['block']
  SAVE_MODULES: ['model']
  TRAIN_MODULES: ['model']

  # File systems, each with its own temporary directory.
  FILE_SYSTEM:
    - NAME: HuggingfaceFs
      TEMP_DIR: ./cache
    - NAME: ModelscopeFs
      TEMP_DIR: ./cache

  # Model definition with its diffusion, backbone, autoencoder and
  # text-conditioning sub-configurations.
  MODEL:
    NAME: LatentDiffusionACEPlus
    PARAMETERIZATION: rf
    TIMESTEPS: 1000
    GUIDE_SCALE: 1.0
    # The empty value parses as null.
    PRETRAINED_MODEL:
    IGNORE_KEYS: []
    USE_EMA: False
    EVAL_EMA: False
    SIZE_FACTOR: 8

    DIFFUSION:
      NAME: DiffusionFluxRF
      PREDICTION_TYPE: raw
      NOISE_NORM: True
      # NOISE_SCHEDULER DESCRIPTION: TYPE: default: ''
      NOISE_SCHEDULER:
        NAME: FlowMatchFluxShiftScheduler
        SHIFT: False
        PRE_T_SAMPLE: True
        PRE_T_SAMPLE_FOLD: 1
        SIGMOID_SCALE: 1
        BASE_SHIFT: 0.5
        MAX_SHIFT: 1.15
      SAMPLER_SCHEDULER:
        NAME: FlowMatchFluxShiftScheduler
        SHIFT: True
        PRE_T_SAMPLE: False
        SIGMOID_SCALE: 1
        BASE_SHIFT: 0.5
        MAX_SHIFT: 1.15

    DIFFUSION_MODEL:
      # NAME DESCRIPTION: TYPE: default: 'Flux'
      NAME: FluxMRACEPlus
      PRETRAINED_MODEL: /home/Ubuntu/Downloads/Unmodel/Reference_models/flux1-fill-dev.safetensors
      # IN_CHANNELS DESCRIPTION: model's input channels. TYPE: int default: 64
      IN_CHANNELS: 384
      # OUT_CHANNELS DESCRIPTION: model's output channels. TYPE: int default: 64
      OUT_CHANNELS: 64
      # HIDDEN_SIZE DESCRIPTION: model's hidden size. TYPE: int default: 1024
      HIDDEN_SIZE: 3072
      REDUX_DIM: 1152
      # NUM_HEADS DESCRIPTION: number of heads in the transformer. TYPE: int default: 16
      NUM_HEADS: 24
      # AXES_DIM DESCRIPTION: dimensions of the axes of the positional encoding. TYPE: list default: [16, 56, 56]
      AXES_DIM: [16, 56, 56]
      # THETA DESCRIPTION: theta for positional encoding. TYPE: int default: 10000
      THETA: 10000
      # VEC_IN_DIM DESCRIPTION: dimension of the vector input. TYPE: int default: 768
      VEC_IN_DIM: 768
      # GUIDANCE_EMBED DESCRIPTION: whether to use guidance embedding. TYPE: bool default: False
      GUIDANCE_EMBED: True
      # CONTEXT_IN_DIM DESCRIPTION: dimension of the context input. TYPE: int default: 4096
      CONTEXT_IN_DIM: 4096
      # MLP_RATIO DESCRIPTION: ratio of mlp hidden size to hidden size. TYPE: float default: 4.0
      MLP_RATIO: 4.0
      # QKV_BIAS DESCRIPTION: whether to use bias in qkv projection. TYPE: bool default: True
      QKV_BIAS: True
      # DEPTH DESCRIPTION: number of transformer blocks. TYPE: int default: 19
      DEPTH: 19
      # DEPTH_SINGLE_BLOCKS DESCRIPTION: number of transformer blocks in the single stream block. TYPE: int default: 38
      DEPTH_SINGLE_BLOCKS: 38
      # ATTN_BACKEND: set 'flash_attn' to use flash_attn2; if the PyTorch
      # version is > 2.4.0, set 'pytorch' to use PyTorch's implementation.
      ATTN_BACKEND: flash_attn
      # USE_GRAD_CHECKPOINT: setting this to true can decrease memory usage.
      USE_GRAD_CHECKPOINT: True

    FIRST_STAGE_MODEL:
      NAME: AutoencoderKLFlux
      EMBED_DIM: 16
      PRETRAINED_MODEL: /home/Ubuntu/Downloads/Unmodel/Reference_models/ae.safetensors
      IGNORE_KEYS: []
      BATCH_SIZE: 8
      USE_CONV: False
      SCALE_FACTOR: 0.3611
      SHIFT_FACTOR: 0.1159

      ENCODER:
        NAME: Encoder
        CH: 128
        OUT_CH: 3
        NUM_RES_BLOCKS: 2
        IN_CHANNELS: 3
        ATTN_RESOLUTIONS: []
        CH_MULT: [1, 2, 4, 4]
        Z_CHANNELS: 16
        DOUBLE_Z: True
        DROPOUT: 0.0
        RESAMP_WITH_CONV: True

      DECODER:
        NAME: Decoder
        CH: 128
        OUT_CH: 3
        NUM_RES_BLOCKS: 2
        IN_CHANNELS: 3
        ATTN_RESOLUTIONS: []
        CH_MULT: [1, 2, 4, 4]
        Z_CHANNELS: 16
        DROPOUT: 0.0
        RESAMP_WITH_CONV: True
        GIVE_PRE_END: False
        TANH_OUT: False

    COND_STAGE_MODEL:
      # NAME DESCRIPTION: TYPE: default: 'T5PlusClipFluxEmbedder'
      NAME: T5ACEPlusClipFluxEmbedder
      # T5_MODEL DESCRIPTION: TYPE: default: ''
      T5_MODEL:
        # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
        NAME: ACEHFEmbedder
        # HF_MODEL_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
        HF_MODEL_CLS: T5EncoderModel
        # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
        MODEL_PATH: /home/Ubuntu/Downloads/Unmodel/Reference_models/t5_xxl/
        # HF_TOKENIZER_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
        HF_TOKENIZER_CLS: T5Tokenizer
        # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
        TOKENIZER_PATH: /home/Ubuntu/Downloads/Unmodel/Reference_models/t5_xxl/
        # Every entry is quoted because '{' would otherwise start a flow mapping.
        ADDED_IDENTIFIER:
          - '<img>'
          - '{image}'
          - '{caption}'
          - '{mask}'
          - '{ref_image}'
          - '{image1}'
          - '{image2}'
          - '{image3}'
          - '{image4}'
          - '{image5}'
          - '{image6}'
          - '{image7}'
          - '{image8}'
          - '{image9}'
        # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
        MAX_LENGTH: 512
        # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
        OUTPUT_KEY: last_hidden_state
        # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
        D_TYPE: bfloat16
        # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
        BATCH_INFER: False
        CLEAN: whitespace
      # CLIP_MODEL DESCRIPTION: TYPE: default: ''
      CLIP_MODEL:
        # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
        NAME: ACEHFEmbedder
        # HF_MODEL_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
        HF_MODEL_CLS: CLIPTextModel
        # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
        MODEL_PATH: /home/Ubuntu/Downloads/Unmodel/Reference_models/clip_l/
        # HF_TOKENIZER_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
        HF_TOKENIZER_CLS: CLIPTokenizer
        # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
        TOKENIZER_PATH: /home/Ubuntu/Downloads/Unmodel/Reference_models/clip_l/
        # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
        MAX_LENGTH: 77
        # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
        OUTPUT_KEY: pooler_output
        # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
        D_TYPE: bfloat16
        # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
        BATCH_INFER: True
        CLEAN: whitespace

  # NOTE(review): placed directly under SOLVER; the original nesting could not
  # be recovered from the flattened text, so confirm against the framework.
  TUNER:
    # The LoRA parameters.
    - NAME: SwiftLoRA
      R: 64
      LORA_ALPHA: 64
      LORA_DROPOUT: 0.0
      BIAS: "none"
      TARGET_MODULES: "(model.double_blocks.*(.qkv|.img_mlp.0|.img_mlp.2|.txt_mlp.0|.txt_mlp.2|.proj|.img_mod.lin|.txt_mod.lin))|(model.single_blocks.*(.linear1|.linear2|.modulation.lin))$"

  SAMPLE_ARGS:
    SAMPLE_STEPS: 28
    SAMPLER: flow_euler
    SEED: 42
    IMAGE_SIZE: [1024, 1024]
    GUIDE_SCALE: 50

  LR_SCHEDULER:
    NAME: StepAnnealingLR
    WARMUP_STEPS: 0
    TOTAL_STEPS: 100000
    DECAY_MODE: 'cosine'

  OPTIMIZER:
    NAME: AdamW
    # Exponent literals carry an explicit ".0" so that YAML 1.1 loaders
    # (e.g. PyYAML) resolve them as floats; a bare "1e-3" resolves as a string
    # there.
    LEARNING_RATE: 1.0e-3
    BETAS: [0.9, 0.999]
    EPS: 1.0e-6
    WEIGHT_DECAY: 1.0e-2
    AMSGRAD: False

  TRAIN_DATA:
    NAME: ACEPlusDataset
    MODE: train
    DATA_LIST: data/train.csv
    # Quoted because an unquoted '#' would start a comment.
    DELIMITER: "#;#"
    # input_image, input_mask, input_reference_image, target_image, instruction, task_type
    FIELDS: ["edit_image", "edit_mask", "ref_image", "target_image", "prompt", "data_type"]
    PATH_PREFIX: ""
    EDIT_TYPE_LIST: []
    MAX_SEQ_LEN: 2048
    D: 16
    PIN_MEMORY: True
    BATCH_SIZE: 1
    NUM_WORKERS: 4
    SAMPLER:
      NAME: LoopSampler

  # NOTE(review): evaluation reads the same DATA_LIST as training.
  EVAL_DATA:
    NAME: ACEPlusDataset
    MODE: eval
    DATA_LIST: data/train.csv
    # Quoted because an unquoted '#' would start a comment.
    DELIMITER: "#;#"
    # input_image, input_mask, input_reference_image, target_image, instruction, task_type
    FIELDS: ["edit_image", "edit_mask", "ref_image", "target_image", "prompt", "data_type"]
    PATH_PREFIX: ""
    EDIT_TYPE_LIST: []
    MAX_SEQ_LEN: 2048
    D: 16
    PIN_MEMORY: True
    BATCH_SIZE: 1
    NUM_WORKERS: 4

  TRAIN_HOOKS:
    - NAME: ACEBackwardHook
      GRADIENT_CLIP: 1.0
      PRIORITY: 10
    - NAME: LogHook
      LOG_INTERVAL: 20
    - NAME: ACECheckpointHook
      # NOTE(review): 10 is active while 250 is commented out; confirm the
      # intended checkpoint interval.
      INTERVAL: 10
      # INTERVAL: 250
      PRIORITY: 200
      DISABLE_SNAPSHOT: True
    - NAME: ProbeDataHook
      PROB_INTERVAL: 50
      PRIORITY: 0
    - NAME: TensorboardLogHook
      LOG_INTERVAL: 50

  EVAL_HOOKS:
    - NAME: ProbeDataHook
      PROB_INTERVAL: 10
      # PROB_INTERVAL: 50
      PRIORITY: 0