# Module-level configuration values (plain assignments, no logic).

# Passed as `cache_dir` to the `vae` and `text_encoder` entries of this file.
# None here; presumably means "use the library default cache" -- confirm.
CACHE_DIR = None

# Passed as `from_pretrained` to the `model` entry of this file.
# None here; presumably means "no pretrained checkpoint" -- confirm.
PRETRAINED_MODEL = None
# Video clip settings.
# NOTE(review): exact semantics depend on the consuming script -- confirm.
num_frames = 16
frame_interval = 1
image_size = (256, 256)

# Integer floor division: evaluates to the int 15.
fps = 30 // 2
# Data-loading settings.
root = None

# Placeholder string; presumably replaced with a real CSV path before
# use -- confirm against the launch script.
data_path = "CSV_PATH"

use_image_transform = False
num_workers = 6
# Precision / parallelism settings.
# NOTE(review): the accepted values for `dtype` and `plugin` are defined by
# the consuming training framework, not by this file -- confirm there.
dtype = "fp16"
grad_checkpoint = False
plugin = "zero2"
sp_size = 1
data_prefetch = 1
# Constants consumed by the `model` entry of this file
# (`camera_fuser_linear_dims` and `camera_format`).
MODEL_DIM = 1152
CAMERA_FORMAT = 'extrinsic'

# NOTE(review): 12 presumably corresponds to a flattened 3x4 extrinsic
# matrix -- confirm against the dataset code.
CAMERA_PARAMS_NUM = 12
# Model settings, stored as a plain dict of keyword options.
# Relies on PRETRAINED_MODEL, MODEL_DIM, CAMERA_PARAMS_NUM and CAMERA_FORMAT
# being defined earlier in this module.
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained=PRETRAINED_MODEL,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
    # Two-element list: [MODEL_DIM + CAMERA_PARAMS_NUM, MODEL_DIM].
    camera_fuser_linear_dims=[MODEL_DIM + CAMERA_PARAMS_NUM, MODEL_DIM],
    camera_format=CAMERA_FORMAT,
)
# VAE settings, stored as a plain dict of keyword options.
# Relies on CACHE_DIR being defined earlier in this module.
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    cache_dir=CACHE_DIR,
)
# Text-encoder settings, stored as a plain dict of keyword options.
# Relies on CACHE_DIR being defined earlier in this module.
text_encoder = dict(
    type="t5",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
    cache_dir=CACHE_DIR,
)
# Scheduler settings, stored as a plain dict of keyword options.
# NOTE(review): `cfg_scale_t` / `cfg_scale_c` are presumably the
# classifier-free-guidance scales for text and camera conditioning --
# confirm against the "iddpm_camera" scheduler implementation.
scheduler = dict(
    type="iddpm_camera",
    cfg_scale_t=6.0,
    cfg_scale_c=4.0,
)
# Run-level settings: seeding, logging and checkpoint cadence.
# NOTE(review): the units of `log_every` / `ckpt_every` (presumably
# optimisation steps) are set by the training loop -- confirm there.
seed = 42
wandb = True
epochs = 12
log_every = 300
ckpt_every = 2000
# Dataset settings, stored as a plain dict of keyword options.
# NOTE(review): the meaning of each option is defined by the dataset class
# that consumes this dict -- confirm there before changing values.
dataset = dict(
    text_dropout=0.05,
    camera_dropout=0.05,
    static_camera_rate=0.0,
    resolution=256,
    version='v0.7',
    frame_strides=[4, 5, 6, 7, 8],
    plucker_coord=False,
    expand_rt=False,
)
# Optimisation settings.
# `load` is None here; presumably a checkpoint path to resume from when
# set -- confirm against the training script.
load = None
batch_size = 6
lr = 1e-5
grad_clip = 1.0

# NOTE(review): presumably, with `freeze_model` enabled, only parameters whose
# names match `active_layer_names` stay trainable -- confirm in the trainer.
freeze_model = True
active_layer_names = ['camera_fuser', 'attn_temp']
# Prompt / output settings.
prompt_path = "./assets/texts/realestate10k.txt"

# Empty string here; presumably must be filled in with a camera file path
# before use -- confirm against the consuming script.
camera_path = ''

nprompts = None
save_dir = None