Add FASTGEN_SF_OUTPUT
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +0 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/config.yaml +259 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug-internal.log +6 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug.log +19 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/output.log +0 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/requirements.txt +222 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/wandb-metadata.json +68 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-core.log +8 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log +6 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log +19 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/run-zmgbhqqw.wandb +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb_id.txt +1 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/config.yaml +259 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug-internal.log +6 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug.log +19 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/output.log +739 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/requirements.txt +222 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/wandb-metadata.json +68 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-core.log +8 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log +6 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log +19 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/run-nkf4iovm.wandb +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb_id.txt +1 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2/config.yaml +259 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/config.yaml +259 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/wandb_id.txt +1 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/config.yaml +259 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug-internal.log +11 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug.log +21 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/config.yaml +362 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/output.log +310 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/requirements.txt +222 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-metadata.json +69 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-summary.json +1 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-core.log +15 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log +11 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log +21 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/run-spcd04xe.wandb +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb_id.txt +1 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_no_reqgrad_toggle/config.yaml +259 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/.metadata +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__0_0.distcp +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__1_0.distcp +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__2_0.distcp +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__3_0.distcp +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.pth +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/.metadata +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__0_0.distcp +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__1_0.distcp +3 -0
- FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__2_0.distcp +3 -0
.gitattributes
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/config.yaml
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataloader_train:
|
| 2 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 3 |
+
batch_size: '8'
|
| 4 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 5 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 6 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 7 |
+
num_workers: '2'
|
| 8 |
+
use_ref_sequence: 'True'
|
| 9 |
+
dataloader_val:
|
| 10 |
+
_target_: <function create_omniavatar_dataloader at 0x7fa0fef8fb00>
|
| 11 |
+
batch_size: '1'
|
| 12 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 13 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 14 |
+
load_ode_path: 'False'
|
| 15 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 16 |
+
num_workers: '2'
|
| 17 |
+
use_ref_sequence: 'True'
|
| 18 |
+
eval:
|
| 19 |
+
max_ckpt: '100000000'
|
| 20 |
+
min_ckpt: '0'
|
| 21 |
+
num_samples: '50000'
|
| 22 |
+
samples_dir: samples
|
| 23 |
+
save_images: 'False'
|
| 24 |
+
log_config:
|
| 25 |
+
group: omniavatar_sf
|
| 26 |
+
name: sf_4gpu_bs8_5000iter_shift5
|
| 27 |
+
project: OmniAvatar-FastGen
|
| 28 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 29 |
+
wandb_entity: paulhcho
|
| 30 |
+
wandb_mode: online
|
| 31 |
+
model:
|
| 32 |
+
add_teacher_to_fsdp_dict: 'True'
|
| 33 |
+
context_noise: '0.0'
|
| 34 |
+
ddp_find_unused_parameters: 'True'
|
| 35 |
+
device: cuda
|
| 36 |
+
discriminator:
|
| 37 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 38 |
+
all_res:
|
| 39 |
+
- '32'
|
| 40 |
+
- '16'
|
| 41 |
+
- '8'
|
| 42 |
+
feature_indices: '{0, 1, 2}'
|
| 43 |
+
in_channels: '256'
|
| 44 |
+
discriminator_optimizer:
|
| 45 |
+
_target_: <function get_optimizer at 0x7fa0fefee660>
|
| 46 |
+
betas:
|
| 47 |
+
- '0.9'
|
| 48 |
+
- '0.999'
|
| 49 |
+
eps: 1e-08
|
| 50 |
+
fused: 'False'
|
| 51 |
+
lr: '0.0001'
|
| 52 |
+
model: null
|
| 53 |
+
optim_type: adamw
|
| 54 |
+
weight_decay: '0.01'
|
| 55 |
+
discriminator_scheduler:
|
| 56 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 57 |
+
cycle_lengths:
|
| 58 |
+
- '10000000000'
|
| 59 |
+
f_max:
|
| 60 |
+
- '1.0'
|
| 61 |
+
f_min:
|
| 62 |
+
- '1.0'
|
| 63 |
+
f_start:
|
| 64 |
+
- 1e-06
|
| 65 |
+
warm_up_steps:
|
| 66 |
+
- '0'
|
| 67 |
+
enable_gradient_in_rollout: 'True'
|
| 68 |
+
enable_preprocessors: 'True'
|
| 69 |
+
fake_score: null
|
| 70 |
+
fake_score_net:
|
| 71 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 72 |
+
audio_hidden_size: '32'
|
| 73 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 74 |
+
in_dim: '65'
|
| 75 |
+
merge_lora: 'False'
|
| 76 |
+
mode: v2v
|
| 77 |
+
model_size: 1.3B
|
| 78 |
+
net_pred_type: flow
|
| 79 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 80 |
+
schedule_type: rf
|
| 81 |
+
use_audio: 'True'
|
| 82 |
+
fake_score_optimizer:
|
| 83 |
+
_target_: <function get_optimizer at 0x7fa0fefee660>
|
| 84 |
+
betas:
|
| 85 |
+
- '0.0'
|
| 86 |
+
- '0.999'
|
| 87 |
+
eps: 1e-08
|
| 88 |
+
fused: 'False'
|
| 89 |
+
lr: 2e-06
|
| 90 |
+
model: null
|
| 91 |
+
optim_type: adamw
|
| 92 |
+
weight_decay: '0.01'
|
| 93 |
+
fake_score_pred_type: x0
|
| 94 |
+
fake_score_scheduler:
|
| 95 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 96 |
+
cycle_lengths:
|
| 97 |
+
- '10000000000'
|
| 98 |
+
f_max:
|
| 99 |
+
- '1.0'
|
| 100 |
+
f_min:
|
| 101 |
+
- '1.0'
|
| 102 |
+
f_start:
|
| 103 |
+
- 1e-06
|
| 104 |
+
warm_up_steps:
|
| 105 |
+
- '0'
|
| 106 |
+
fsdp_meta_init: 'False'
|
| 107 |
+
gan_loss_weight_gen: '0'
|
| 108 |
+
gan_r1_reg_alpha: '0.1'
|
| 109 |
+
gan_r1_reg_weight: '0.0'
|
| 110 |
+
gan_use_same_t_noise: 'False'
|
| 111 |
+
grad_scaler_enabled: 'False'
|
| 112 |
+
grad_scaler_growth_interval: '2000'
|
| 113 |
+
grad_scaler_init_scale: '65536.0'
|
| 114 |
+
guidance_scale: '4.5'
|
| 115 |
+
input_shape:
|
| 116 |
+
- '16'
|
| 117 |
+
- '21'
|
| 118 |
+
- '64'
|
| 119 |
+
- '64'
|
| 120 |
+
last_step_only: 'False'
|
| 121 |
+
load_student_weights: 'False'
|
| 122 |
+
net:
|
| 123 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 124 |
+
audio_hidden_size: '32'
|
| 125 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 126 |
+
chunk_size: '3'
|
| 127 |
+
in_dim: '65'
|
| 128 |
+
mode: v2v
|
| 129 |
+
model_size: 1.3B
|
| 130 |
+
net_pred_type: flow
|
| 131 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 132 |
+
schedule_type: rf
|
| 133 |
+
total_num_frames: '21'
|
| 134 |
+
use_audio: 'True'
|
| 135 |
+
net_optimizer:
|
| 136 |
+
_target_: <function get_optimizer at 0x7fa0fefee660>
|
| 137 |
+
betas:
|
| 138 |
+
- '0.0'
|
| 139 |
+
- '0.999'
|
| 140 |
+
eps: 1e-08
|
| 141 |
+
fused: 'False'
|
| 142 |
+
lr: 2e-06
|
| 143 |
+
model: null
|
| 144 |
+
optim_type: adamw
|
| 145 |
+
weight_decay: '0.01'
|
| 146 |
+
net_scheduler:
|
| 147 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 148 |
+
cycle_lengths:
|
| 149 |
+
- '10000000000'
|
| 150 |
+
f_max:
|
| 151 |
+
- '1.0'
|
| 152 |
+
f_min:
|
| 153 |
+
- '1.0'
|
| 154 |
+
f_start:
|
| 155 |
+
- 1e-06
|
| 156 |
+
warm_up_steps:
|
| 157 |
+
- '0'
|
| 158 |
+
precision: bfloat16
|
| 159 |
+
precision_amp: null
|
| 160 |
+
precision_amp_enc: null
|
| 161 |
+
precision_amp_infer: null
|
| 162 |
+
precision_fsdp: bfloat16
|
| 163 |
+
pretrained_model_path: ''
|
| 164 |
+
pretrained_student_net_path: ''
|
| 165 |
+
same_step_across_blocks: 'True'
|
| 166 |
+
sample_t_cfg:
|
| 167 |
+
log_t_df: '0.01'
|
| 168 |
+
max_t: '0.999'
|
| 169 |
+
min_t: '0.001'
|
| 170 |
+
shift: '5.0'
|
| 171 |
+
t_list:
|
| 172 |
+
- '0.999'
|
| 173 |
+
- '0.937'
|
| 174 |
+
- '0.833'
|
| 175 |
+
- '0.624'
|
| 176 |
+
- '0.0'
|
| 177 |
+
time_dist_type: shifted
|
| 178 |
+
train_p_mean: '-1.1'
|
| 179 |
+
train_p_std: '2.0'
|
| 180 |
+
skip_layers: null
|
| 181 |
+
start_gradient_frame: '0'
|
| 182 |
+
student_sample_steps: '4'
|
| 183 |
+
student_sample_type: sde
|
| 184 |
+
student_update_freq: '5'
|
| 185 |
+
teacher:
|
| 186 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 187 |
+
audio_hidden_size: '32'
|
| 188 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 189 |
+
in_dim: '65'
|
| 190 |
+
merge_lora: 'True'
|
| 191 |
+
mode: v2v
|
| 192 |
+
model_size: 14B
|
| 193 |
+
net_pred_type: flow
|
| 194 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 195 |
+
schedule_type: rf
|
| 196 |
+
use_audio: 'True'
|
| 197 |
+
use_ema: 'False'
|
| 198 |
+
model_class:
|
| 199 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 200 |
+
config: null
|
| 201 |
+
trainer:
|
| 202 |
+
augment_pipe: null
|
| 203 |
+
batch_size_global: null
|
| 204 |
+
callbacks:
|
| 205 |
+
ema:
|
| 206 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 207 |
+
beta: '0.9999'
|
| 208 |
+
ema_halflife_kimg: '500'
|
| 209 |
+
ema_rampup_ratio: '0.05'
|
| 210 |
+
gamma: '16.97'
|
| 211 |
+
start_iter: '0'
|
| 212 |
+
type: constant
|
| 213 |
+
gpu_stats:
|
| 214 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 215 |
+
every_n: '100'
|
| 216 |
+
grad_clip:
|
| 217 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 218 |
+
grad_norm: '10.0'
|
| 219 |
+
model_key: net
|
| 220 |
+
param_count:
|
| 221 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 222 |
+
train_profiler:
|
| 223 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 224 |
+
every_n: '100'
|
| 225 |
+
wandb:
|
| 226 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 227 |
+
fps: '25'
|
| 228 |
+
sample_logging_iter: '100'
|
| 229 |
+
checkpointer:
|
| 230 |
+
pretrained_ckpt_key_map:
|
| 231 |
+
net: net
|
| 232 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 233 |
+
s3_container: s3://checkpoints/fastgen
|
| 234 |
+
s3_credential: ./credentials/s3.json
|
| 235 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/checkpoints
|
| 236 |
+
use_s3: 'False'
|
| 237 |
+
cudnn:
|
| 238 |
+
benchmark: 'True'
|
| 239 |
+
deterministic: 'False'
|
| 240 |
+
ddp: 'False'
|
| 241 |
+
fsdp: 'True'
|
| 242 |
+
fsdp_cpu_offload: 'False'
|
| 243 |
+
fsdp_min_num_params: '10000000'
|
| 244 |
+
fsdp_sharding_group_size: null
|
| 245 |
+
global_vars: null
|
| 246 |
+
global_vars_val:
|
| 247 |
+
- null
|
| 248 |
+
grad_accum_rounds: '2'
|
| 249 |
+
logging_iter: '1'
|
| 250 |
+
max_iter: '5000'
|
| 251 |
+
offload_module_in_decoding: 'False'
|
| 252 |
+
resume: 'False'
|
| 253 |
+
save_ckpt_iter: '100'
|
| 254 |
+
seed: '0'
|
| 255 |
+
skip_initial_validation: 'True'
|
| 256 |
+
tf32_enabled: 'True'
|
| 257 |
+
val_seed: null
|
| 258 |
+
validation_iter: '100'
|
| 259 |
+
visualize_teacher: 'False'
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-02T20:58:42.181919016+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
|
| 2 |
+
{"time":"2026-04-02T20:58:42.574143607+09:00","level":"INFO","msg":"stream: created new stream","id":"zmgbhqqw"}
|
| 3 |
+
{"time":"2026-04-02T20:58:42.574192032+09:00","level":"INFO","msg":"handler: started","stream_id":"zmgbhqqw"}
|
| 4 |
+
{"time":"2026-04-02T20:58:42.574261225+09:00","level":"INFO","msg":"stream: started","id":"zmgbhqqw"}
|
| 5 |
+
{"time":"2026-04-02T20:58:42.574281395+09:00","level":"INFO","msg":"writer: started","stream_id":"zmgbhqqw"}
|
| 6 |
+
{"time":"2026-04-02T20:58:42.57428521+09:00","level":"INFO","msg":"sender: started","stream_id":"zmgbhqqw"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
|
| 2 |
+
2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Configure stats pid to 549927
|
| 3 |
+
2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log
|
| 5 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log
|
| 6 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_5000iter_shift5', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7fbeacc8fb00>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
|
| 9 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-04-02 20:58:42,167 INFO MainThread:549927 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-04-02 20:58:42,179 INFO MainThread:549927 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-04-02 20:58:42,183 INFO MainThread:549927 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-04-02 20:58:42,199 INFO MainThread:549927 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-02 20:58:43,281 INFO MainThread:549927 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_console_start():2524] atexit reg
|
| 16 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2373] redirect: wrap_raw
|
| 17 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2442] Wrapping output streams.
|
| 18 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2465] Redirects installed.
|
| 19 |
+
2026-04-02 20:58:43,513 INFO MainThread:549927 [wandb_init.py:init():1082] run started, returning control to user process
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/requirements.txt
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastgen==0.1.0
|
| 2 |
+
nvitop==1.6.1
|
| 3 |
+
ftfy==6.3.1
|
| 4 |
+
braceexpand==0.1.7
|
| 5 |
+
antlr4-python3-runtime==4.9.3
|
| 6 |
+
webdataset==1.0.2
|
| 7 |
+
sentry-sdk==2.53.0
|
| 8 |
+
rdkit==2025.9.5
|
| 9 |
+
python-dotenv==1.2.1
|
| 10 |
+
proglog==0.1.12
|
| 11 |
+
omegaconf==2.3.0
|
| 12 |
+
narwhals==2.17.0
|
| 13 |
+
loguru==0.7.3
|
| 14 |
+
imageio-ffmpeg==0.6.0
|
| 15 |
+
plotly==6.5.2
|
| 16 |
+
moviepy==2.2.1
|
| 17 |
+
hydra-core==1.3.2
|
| 18 |
+
wandb==0.25.0
|
| 19 |
+
fastgen==0.1.0
|
| 20 |
+
packaging==25.0
|
| 21 |
+
setuptools==80.10.2
|
| 22 |
+
wheel==0.46.3
|
| 23 |
+
pip==26.0.1
|
| 24 |
+
webencodings==0.5.1
|
| 25 |
+
pure_eval==0.2.3
|
| 26 |
+
ptyprocess==0.7.0
|
| 27 |
+
nvidia-ml-py==13.590.48
|
| 28 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 29 |
+
mpmath==1.3.0
|
| 30 |
+
fastjsonschema==2.21.2
|
| 31 |
+
zipp==3.23.0
|
| 32 |
+
xyzservices==2025.11.0
|
| 33 |
+
widgetsnbextension==4.0.15
|
| 34 |
+
websocket-client==1.9.0
|
| 35 |
+
webcolors==25.10.0
|
| 36 |
+
wcwidth==0.6.0
|
| 37 |
+
urllib3==2.6.3
|
| 38 |
+
uri-template==1.3.0
|
| 39 |
+
tzdata==2025.3
|
| 40 |
+
typing_extensions==4.15.0
|
| 41 |
+
triton==3.6.0
|
| 42 |
+
traitlets==5.14.3
|
| 43 |
+
tqdm==4.67.3
|
| 44 |
+
tornado==6.5.5
|
| 45 |
+
tinycss2==1.4.0
|
| 46 |
+
sympy==1.14.0
|
| 47 |
+
soupsieve==2.8.3
|
| 48 |
+
smmap==5.0.3
|
| 49 |
+
six==1.16.0
|
| 50 |
+
sentencepiece==0.2.1
|
| 51 |
+
Send2Trash==2.1.0
|
| 52 |
+
safetensors==0.7.0
|
| 53 |
+
rpds-py==0.30.0
|
| 54 |
+
rfc3986-validator==0.1.1
|
| 55 |
+
regex==2026.2.28
|
| 56 |
+
pyzmq==27.1.0
|
| 57 |
+
PyYAML==6.0.3
|
| 58 |
+
python-json-logger==4.0.0
|
| 59 |
+
Pygments==2.19.2
|
| 60 |
+
pycparser==3.0
|
| 61 |
+
psutil==7.2.2
|
| 62 |
+
protobuf==4.24.4
|
| 63 |
+
prometheus_client==0.24.1
|
| 64 |
+
platformdirs==4.9.4
|
| 65 |
+
pillow==11.3.0
|
| 66 |
+
pexpect==4.9.0
|
| 67 |
+
parso==0.8.6
|
| 68 |
+
pandocfilters==1.5.1
|
| 69 |
+
nvidia-nvtx-cu12==12.8.90
|
| 70 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 71 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 72 |
+
nvidia-nccl-cu12==2.27.5
|
| 73 |
+
nvidia-curand-cu12==10.3.9.90
|
| 74 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 76 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 77 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 78 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 79 |
+
numpy==1.26.4
|
| 80 |
+
networkx==3.6.1
|
| 81 |
+
nest-asyncio==1.6.0
|
| 82 |
+
mistune==3.2.0
|
| 83 |
+
MarkupSafe==3.0.3
|
| 84 |
+
lark==1.3.1
|
| 85 |
+
jupyterlab_widgets==3.0.16
|
| 86 |
+
jupyterlab_pygments==0.3.0
|
| 87 |
+
jsonpointer==3.0.0
|
| 88 |
+
json5==0.13.0
|
| 89 |
+
jmespath==1.1.0
|
| 90 |
+
idna==3.11
|
| 91 |
+
hf-xet==1.4.2
|
| 92 |
+
h11==0.16.0
|
| 93 |
+
fsspec==2026.2.0
|
| 94 |
+
fqdn==1.5.1
|
| 95 |
+
filelock==3.25.2
|
| 96 |
+
executing==2.2.1
|
| 97 |
+
einops==0.8.2
|
| 98 |
+
defusedxml==0.7.1
|
| 99 |
+
decorator==5.2.1
|
| 100 |
+
debugpy==1.8.20
|
| 101 |
+
cuda-pathfinder==1.4.2
|
| 102 |
+
comm==0.2.3
|
| 103 |
+
click==8.3.1
|
| 104 |
+
charset-normalizer==3.4.5
|
| 105 |
+
certifi==2026.2.25
|
| 106 |
+
bleach==6.3.0
|
| 107 |
+
babel==2.18.0
|
| 108 |
+
av==17.0.0
|
| 109 |
+
attrs==25.4.0
|
| 110 |
+
async-lru==2.2.0
|
| 111 |
+
asttokens==3.0.1
|
| 112 |
+
annotated-types==0.7.0
|
| 113 |
+
typing-inspection==0.4.2
|
| 114 |
+
terminado==0.18.1
|
| 115 |
+
stack-data==0.6.3
|
| 116 |
+
scipy==1.17.1
|
| 117 |
+
rfc3987-syntax==1.1.0
|
| 118 |
+
rfc3339-validator==0.1.4
|
| 119 |
+
requests==2.32.5
|
| 120 |
+
referencing==0.37.0
|
| 121 |
+
python-dateutil==2.9.0.post0
|
| 122 |
+
pydantic_core==2.41.5
|
| 123 |
+
prompt_toolkit==3.0.52
|
| 124 |
+
opencv-python-headless==4.11.0.86
|
| 125 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 126 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 127 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 128 |
+
matplotlib-inline==0.2.1
|
| 129 |
+
jupyter_core==5.9.1
|
| 130 |
+
Jinja2==3.1.6
|
| 131 |
+
jedi==0.19.2
|
| 132 |
+
ipython_pygments_lexers==1.1.1
|
| 133 |
+
importlib_metadata==8.7.1
|
| 134 |
+
ImageIO==2.37.3
|
| 135 |
+
httpcore==1.0.9
|
| 136 |
+
gitdb==4.0.12
|
| 137 |
+
cuda-bindings==12.9.4
|
| 138 |
+
contourpy==1.3.3
|
| 139 |
+
cffi==2.0.0
|
| 140 |
+
beautifulsoup4==4.14.3
|
| 141 |
+
anyio==4.12.1
|
| 142 |
+
soundfile==0.13.1
|
| 143 |
+
pydantic==2.12.5
|
| 144 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 145 |
+
jupyter_server_terminals==0.5.4
|
| 146 |
+
jupyter_client==8.8.0
|
| 147 |
+
jsonschema-specifications==2025.9.1
|
| 148 |
+
ipython==9.11.0
|
| 149 |
+
httpx==0.28.1
|
| 150 |
+
GitPython==3.1.46
|
| 151 |
+
botocore==1.42.68
|
| 152 |
+
bokeh==3.9.0
|
| 153 |
+
arrow==1.4.0
|
| 154 |
+
argon2-cffi-bindings==25.1.0
|
| 155 |
+
torch==2.10.0
|
| 156 |
+
s3transfer==0.16.0
|
| 157 |
+
jsonschema==4.26.0
|
| 158 |
+
isoduration==20.11.0
|
| 159 |
+
ipywidgets==8.1.8
|
| 160 |
+
ipykernel==7.2.0
|
| 161 |
+
argon2-cffi==25.1.0
|
| 162 |
+
torchvision==0.25.0
|
| 163 |
+
nbformat==5.10.4
|
| 164 |
+
jupyter-console==6.6.3
|
| 165 |
+
boto3==1.42.68
|
| 166 |
+
accelerate==1.13.0
|
| 167 |
+
nbclient==0.10.4
|
| 168 |
+
jupyter-events==0.12.0
|
| 169 |
+
nbconvert==7.17.0
|
| 170 |
+
jupyter_server==2.17.0
|
| 171 |
+
notebook_shim==0.2.4
|
| 172 |
+
jupyterlab_server==2.28.0
|
| 173 |
+
jupyter-lsp==2.3.0
|
| 174 |
+
jupyterlab==4.5.6
|
| 175 |
+
notebook==7.5.5
|
| 176 |
+
jupyter==1.1.1
|
| 177 |
+
fastgen==0.1.0
|
| 178 |
+
pandas==3.0.1
|
| 179 |
+
shellingham==1.5.4
|
| 180 |
+
mdurl==0.1.2
|
| 181 |
+
annotated-doc==0.0.4
|
| 182 |
+
markdown-it-py==4.0.0
|
| 183 |
+
rich==14.3.3
|
| 184 |
+
typer==0.24.1
|
| 185 |
+
huggingface_hub==1.7.1
|
| 186 |
+
timm==1.0.25
|
| 187 |
+
tokenizers==0.22.2
|
| 188 |
+
diffusers==0.37.0
|
| 189 |
+
transformers==5.3.0
|
| 190 |
+
peft==0.18.1
|
| 191 |
+
easydict==1.13
|
| 192 |
+
lmdb==2.2.0
|
| 193 |
+
threadpoolctl==3.6.0
|
| 194 |
+
soxr==1.0.0
|
| 195 |
+
msgpack==1.1.2
|
| 196 |
+
llvmlite==0.47.0
|
| 197 |
+
lazy-loader==0.5
|
| 198 |
+
joblib==1.5.3
|
| 199 |
+
audioread==3.1.0
|
| 200 |
+
scikit-learn==1.8.0
|
| 201 |
+
pooch==1.9.0
|
| 202 |
+
numba==0.65.0
|
| 203 |
+
librosa==0.11.0
|
| 204 |
+
simsimd==6.5.16
|
| 205 |
+
flatbuffers==25.12.19
|
| 206 |
+
tifffile==2026.3.3
|
| 207 |
+
stringzilla==4.6.0
|
| 208 |
+
pyparsing==3.3.2
|
| 209 |
+
prettytable==3.17.0
|
| 210 |
+
onnx==1.17.0
|
| 211 |
+
kiwisolver==1.5.0
|
| 212 |
+
fonttools==4.62.1
|
| 213 |
+
Cython==3.2.4
|
| 214 |
+
cycler==0.12.1
|
| 215 |
+
scikit-image==0.26.0
|
| 216 |
+
onnxruntime==1.24.4
|
| 217 |
+
matplotlib==3.10.8
|
| 218 |
+
albucore==0.0.24
|
| 219 |
+
albumentations==2.0.8
|
| 220 |
+
insightface==0.7.3
|
| 221 |
+
kornia_rs==0.1.10
|
| 222 |
+
kornia==0.8.2
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-151-generic-x86_64-with-glibc2.39",
|
| 3 |
+
"python": "CPython 3.12.12",
|
| 4 |
+
"startedAt": "2026-04-02T11:58:41.929718Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config=fastgen/configs/experiments/OmniAvatar/config_sf.py",
|
| 7 |
+
"-",
|
| 8 |
+
"trainer.resume=False",
|
| 9 |
+
"log_config.name=sf_4gpu_bs8_5000iter_shift5",
|
| 10 |
+
"log_config.project=OmniAvatar-FastGen"
|
| 11 |
+
],
|
| 12 |
+
"program": "/home/work/.local/hyunbin/FastGen/train.py",
|
| 13 |
+
"codePath": "train.py",
|
| 14 |
+
"codePathLocal": "train.py",
|
| 15 |
+
"git": {
|
| 16 |
+
"remote": "https://paulcho98:@github.com/paulcho98/FastGen.git",
|
| 17 |
+
"commit": "dead092792003faa07babff77ccd223af4ad9b11"
|
| 18 |
+
},
|
| 19 |
+
"email": "paul.hyunbin@gmail.com",
|
| 20 |
+
"root": "/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5",
|
| 21 |
+
"host": "main1",
|
| 22 |
+
"executable": "/home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12",
|
| 23 |
+
"cpu_count": 112,
|
| 24 |
+
"cpu_count_logical": 224,
|
| 25 |
+
"gpu": "NVIDIA H200",
|
| 26 |
+
"gpu_count": 4,
|
| 27 |
+
"disk": {
|
| 28 |
+
"/": {
|
| 29 |
+
"total": "1356758433792",
|
| 30 |
+
"used": "257961558016"
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"memory": {
|
| 34 |
+
"total": "2163961778176"
|
| 35 |
+
},
|
| 36 |
+
"gpu_nvidia": [
|
| 37 |
+
{
|
| 38 |
+
"name": "NVIDIA H200",
|
| 39 |
+
"memoryTotal": "150754820096",
|
| 40 |
+
"cudaCores": 16896,
|
| 41 |
+
"architecture": "Hopper",
|
| 42 |
+
"uuid": "GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c"
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "NVIDIA H200",
|
| 46 |
+
"memoryTotal": "150754820096",
|
| 47 |
+
"cudaCores": 16896,
|
| 48 |
+
"architecture": "Hopper",
|
| 49 |
+
"uuid": "GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262"
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"name": "NVIDIA H200",
|
| 53 |
+
"memoryTotal": "150754820096",
|
| 54 |
+
"cudaCores": 16896,
|
| 55 |
+
"architecture": "Hopper",
|
| 56 |
+
"uuid": "GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"name": "NVIDIA H200",
|
| 60 |
+
"memoryTotal": "150754820096",
|
| 61 |
+
"cudaCores": 16896,
|
| 62 |
+
"architecture": "Hopper",
|
| 63 |
+
"uuid": "GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0"
|
| 64 |
+
}
|
| 65 |
+
],
|
| 66 |
+
"cudaVersion": "12.9",
|
| 67 |
+
"writerId": "n2ybi81tgd0arslahhy2n7g532wc0pja"
|
| 68 |
+
}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-core.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-02T20:58:41.981092557+09:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpwwrvdk3u/port-549927.txt","pid":549927,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-04-02T20:58:41.98157225+09:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":549927}
|
| 3 |
+
{"time":"2026-04-02T20:58:41.981562902+09:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-549927-558790-3160372452/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-04-02T20:58:42.16745332+09:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-04-02T20:58:42.181823375+09:00","level":"INFO","msg":"handleInformInit: received","streamId":"zmgbhqqw","id":"1(@)"}
|
| 6 |
+
{"time":"2026-04-02T20:58:42.574268009+09:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"zmgbhqqw","id":"1(@)"}
|
| 7 |
+
{"time":"2026-04-02T20:58:48.512451301+09:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"6lpqhxkfa0yx"}
|
| 8 |
+
{"time":"2026-04-02T21:41:09.853455246+09:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-02T20:58:42.181919016+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
|
| 2 |
+
{"time":"2026-04-02T20:58:42.574143607+09:00","level":"INFO","msg":"stream: created new stream","id":"zmgbhqqw"}
|
| 3 |
+
{"time":"2026-04-02T20:58:42.574192032+09:00","level":"INFO","msg":"handler: started","stream_id":"zmgbhqqw"}
|
| 4 |
+
{"time":"2026-04-02T20:58:42.574261225+09:00","level":"INFO","msg":"stream: started","id":"zmgbhqqw"}
|
| 5 |
+
{"time":"2026-04-02T20:58:42.574281395+09:00","level":"INFO","msg":"writer: started","stream_id":"zmgbhqqw"}
|
| 6 |
+
{"time":"2026-04-02T20:58:42.57428521+09:00","level":"INFO","msg":"sender: started","stream_id":"zmgbhqqw"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
|
| 2 |
+
2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Configure stats pid to 549927
|
| 3 |
+
2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log
|
| 5 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log
|
| 6 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_5000iter_shift5', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7fbeacc8fb00>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
|
| 9 |
+
2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-04-02 20:58:42,167 INFO MainThread:549927 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-04-02 20:58:42,179 INFO MainThread:549927 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-04-02 20:58:42,183 INFO MainThread:549927 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-04-02 20:58:42,199 INFO MainThread:549927 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-02 20:58:43,281 INFO MainThread:549927 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_console_start():2524] atexit reg
|
| 16 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2373] redirect: wrap_raw
|
| 17 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2442] Wrapping output streams.
|
| 18 |
+
2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2465] Redirects installed.
|
| 19 |
+
2026-04-02 20:58:43,513 INFO MainThread:549927 [wandb_init.py:init():1082] run started, returning control to user process
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/run-zmgbhqqw.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7d6105253ff0592490ce0e6460f37480df990ab3d55586489772eaeeb75f982
|
| 3 |
+
size 688128
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb_id.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
zmgbhqqw
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/config.yaml
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataloader_train:
|
| 2 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 3 |
+
batch_size: '8'
|
| 4 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 5 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 6 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 7 |
+
num_workers: '2'
|
| 8 |
+
use_ref_sequence: 'True'
|
| 9 |
+
dataloader_val:
|
| 10 |
+
_target_: <function create_omniavatar_dataloader at 0x7fc7a89d7ce0>
|
| 11 |
+
batch_size: '1'
|
| 12 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 13 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 14 |
+
load_ode_path: 'False'
|
| 15 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 16 |
+
num_workers: '2'
|
| 17 |
+
use_ref_sequence: 'True'
|
| 18 |
+
eval:
|
| 19 |
+
max_ckpt: '100000000'
|
| 20 |
+
min_ckpt: '0'
|
| 21 |
+
num_samples: '50000'
|
| 22 |
+
samples_dir: samples
|
| 23 |
+
save_images: 'False'
|
| 24 |
+
log_config:
|
| 25 |
+
group: omniavatar_sf
|
| 26 |
+
name: sf_4gpu_bs8_lr2e6_5000iter_shift5_combined
|
| 27 |
+
project: OmniAvatar-FastGen
|
| 28 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 29 |
+
wandb_entity: paulhcho
|
| 30 |
+
wandb_mode: online
|
| 31 |
+
model:
|
| 32 |
+
add_teacher_to_fsdp_dict: 'True'
|
| 33 |
+
context_noise: '0.0'
|
| 34 |
+
ddp_find_unused_parameters: 'True'
|
| 35 |
+
device: cuda
|
| 36 |
+
discriminator:
|
| 37 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 38 |
+
all_res:
|
| 39 |
+
- '32'
|
| 40 |
+
- '16'
|
| 41 |
+
- '8'
|
| 42 |
+
feature_indices: '{0, 1, 2}'
|
| 43 |
+
in_channels: '256'
|
| 44 |
+
discriminator_optimizer:
|
| 45 |
+
_target_: <function get_optimizer at 0x7fc7a8c2e660>
|
| 46 |
+
betas:
|
| 47 |
+
- '0.9'
|
| 48 |
+
- '0.999'
|
| 49 |
+
eps: 1e-08
|
| 50 |
+
fused: 'False'
|
| 51 |
+
lr: '0.0001'
|
| 52 |
+
model: null
|
| 53 |
+
optim_type: adamw
|
| 54 |
+
weight_decay: '0.01'
|
| 55 |
+
discriminator_scheduler:
|
| 56 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 57 |
+
cycle_lengths:
|
| 58 |
+
- '10000000000'
|
| 59 |
+
f_max:
|
| 60 |
+
- '1.0'
|
| 61 |
+
f_min:
|
| 62 |
+
- '1.0'
|
| 63 |
+
f_start:
|
| 64 |
+
- 1e-06
|
| 65 |
+
warm_up_steps:
|
| 66 |
+
- '0'
|
| 67 |
+
enable_gradient_in_rollout: 'True'
|
| 68 |
+
enable_preprocessors: 'True'
|
| 69 |
+
fake_score: null
|
| 70 |
+
fake_score_net:
|
| 71 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 72 |
+
audio_hidden_size: '32'
|
| 73 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 74 |
+
in_dim: '65'
|
| 75 |
+
merge_lora: 'False'
|
| 76 |
+
mode: v2v
|
| 77 |
+
model_size: 1.3B
|
| 78 |
+
net_pred_type: flow
|
| 79 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 80 |
+
schedule_type: rf
|
| 81 |
+
use_audio: 'True'
|
| 82 |
+
fake_score_optimizer:
|
| 83 |
+
_target_: <function get_optimizer at 0x7fc7a8c2e660>
|
| 84 |
+
betas:
|
| 85 |
+
- '0.0'
|
| 86 |
+
- '0.999'
|
| 87 |
+
eps: 1e-08
|
| 88 |
+
fused: 'False'
|
| 89 |
+
lr: 2e-06
|
| 90 |
+
model: null
|
| 91 |
+
optim_type: adamw
|
| 92 |
+
weight_decay: '0.01'
|
| 93 |
+
fake_score_pred_type: x0
|
| 94 |
+
fake_score_scheduler:
|
| 95 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 96 |
+
cycle_lengths:
|
| 97 |
+
- '10000000000'
|
| 98 |
+
f_max:
|
| 99 |
+
- '1.0'
|
| 100 |
+
f_min:
|
| 101 |
+
- '1.0'
|
| 102 |
+
f_start:
|
| 103 |
+
- 1e-06
|
| 104 |
+
warm_up_steps:
|
| 105 |
+
- '0'
|
| 106 |
+
fsdp_meta_init: 'False'
|
| 107 |
+
gan_loss_weight_gen: '0'
|
| 108 |
+
gan_r1_reg_alpha: '0.1'
|
| 109 |
+
gan_r1_reg_weight: '0.0'
|
| 110 |
+
gan_use_same_t_noise: 'False'
|
| 111 |
+
grad_scaler_enabled: 'False'
|
| 112 |
+
grad_scaler_growth_interval: '2000'
|
| 113 |
+
grad_scaler_init_scale: '65536.0'
|
| 114 |
+
guidance_scale: '4.5'
|
| 115 |
+
input_shape:
|
| 116 |
+
- '16'
|
| 117 |
+
- '21'
|
| 118 |
+
- '64'
|
| 119 |
+
- '64'
|
| 120 |
+
last_step_only: 'False'
|
| 121 |
+
load_student_weights: 'False'
|
| 122 |
+
net:
|
| 123 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 124 |
+
audio_hidden_size: '32'
|
| 125 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 126 |
+
chunk_size: '3'
|
| 127 |
+
in_dim: '65'
|
| 128 |
+
mode: v2v
|
| 129 |
+
model_size: 1.3B
|
| 130 |
+
net_pred_type: flow
|
| 131 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 132 |
+
schedule_type: rf
|
| 133 |
+
total_num_frames: '21'
|
| 134 |
+
use_audio: 'True'
|
| 135 |
+
net_optimizer:
|
| 136 |
+
_target_: <function get_optimizer at 0x7fc7a8c2e660>
|
| 137 |
+
betas:
|
| 138 |
+
- '0.0'
|
| 139 |
+
- '0.999'
|
| 140 |
+
eps: 1e-08
|
| 141 |
+
fused: 'False'
|
| 142 |
+
lr: 2e-06
|
| 143 |
+
model: null
|
| 144 |
+
optim_type: adamw
|
| 145 |
+
weight_decay: '0.01'
|
| 146 |
+
net_scheduler:
|
| 147 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 148 |
+
cycle_lengths:
|
| 149 |
+
- '10000000000'
|
| 150 |
+
f_max:
|
| 151 |
+
- '1.0'
|
| 152 |
+
f_min:
|
| 153 |
+
- '1.0'
|
| 154 |
+
f_start:
|
| 155 |
+
- 1e-06
|
| 156 |
+
warm_up_steps:
|
| 157 |
+
- '0'
|
| 158 |
+
precision: bfloat16
|
| 159 |
+
precision_amp: null
|
| 160 |
+
precision_amp_enc: null
|
| 161 |
+
precision_amp_infer: null
|
| 162 |
+
precision_fsdp: bfloat16
|
| 163 |
+
pretrained_model_path: ''
|
| 164 |
+
pretrained_student_net_path: ''
|
| 165 |
+
same_step_across_blocks: 'True'
|
| 166 |
+
sample_t_cfg:
|
| 167 |
+
log_t_df: '0.01'
|
| 168 |
+
max_t: '0.999'
|
| 169 |
+
min_t: '0.001'
|
| 170 |
+
shift: '5.0'
|
| 171 |
+
t_list:
|
| 172 |
+
- '0.999'
|
| 173 |
+
- '0.937'
|
| 174 |
+
- '0.833'
|
| 175 |
+
- '0.624'
|
| 176 |
+
- '0.0'
|
| 177 |
+
time_dist_type: shifted
|
| 178 |
+
train_p_mean: '-1.1'
|
| 179 |
+
train_p_std: '2.0'
|
| 180 |
+
skip_layers: null
|
| 181 |
+
start_gradient_frame: '0'
|
| 182 |
+
student_sample_steps: '4'
|
| 183 |
+
student_sample_type: sde
|
| 184 |
+
student_update_freq: '5'
|
| 185 |
+
teacher:
|
| 186 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 187 |
+
audio_hidden_size: '32'
|
| 188 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 189 |
+
in_dim: '65'
|
| 190 |
+
merge_lora: 'True'
|
| 191 |
+
mode: v2v
|
| 192 |
+
model_size: 14B
|
| 193 |
+
net_pred_type: flow
|
| 194 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 195 |
+
schedule_type: rf
|
| 196 |
+
use_audio: 'True'
|
| 197 |
+
use_ema: 'False'
|
| 198 |
+
model_class:
|
| 199 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 200 |
+
config: null
|
| 201 |
+
trainer:
|
| 202 |
+
augment_pipe: null
|
| 203 |
+
batch_size_global: null
|
| 204 |
+
callbacks:
|
| 205 |
+
ema:
|
| 206 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 207 |
+
beta: '0.9999'
|
| 208 |
+
ema_halflife_kimg: '500'
|
| 209 |
+
ema_rampup_ratio: '0.05'
|
| 210 |
+
gamma: '16.97'
|
| 211 |
+
start_iter: '0'
|
| 212 |
+
type: constant
|
| 213 |
+
gpu_stats:
|
| 214 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 215 |
+
every_n: '100'
|
| 216 |
+
grad_clip:
|
| 217 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 218 |
+
grad_norm: '10.0'
|
| 219 |
+
model_key: net
|
| 220 |
+
param_count:
|
| 221 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 222 |
+
train_profiler:
|
| 223 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 224 |
+
every_n: '100'
|
| 225 |
+
wandb:
|
| 226 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 227 |
+
fps: '25'
|
| 228 |
+
sample_logging_iter: '100'
|
| 229 |
+
checkpointer:
|
| 230 |
+
pretrained_ckpt_key_map:
|
| 231 |
+
net: net
|
| 232 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 233 |
+
s3_container: s3://checkpoints/fastgen
|
| 234 |
+
s3_credential: ./credentials/s3.json
|
| 235 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/checkpoints
|
| 236 |
+
use_s3: 'False'
|
| 237 |
+
cudnn:
|
| 238 |
+
benchmark: 'True'
|
| 239 |
+
deterministic: 'False'
|
| 240 |
+
ddp: 'False'
|
| 241 |
+
fsdp: 'True'
|
| 242 |
+
fsdp_cpu_offload: 'False'
|
| 243 |
+
fsdp_min_num_params: '10000000'
|
| 244 |
+
fsdp_sharding_group_size: null
|
| 245 |
+
global_vars: null
|
| 246 |
+
global_vars_val:
|
| 247 |
+
- null
|
| 248 |
+
grad_accum_rounds: '2'
|
| 249 |
+
logging_iter: '1'
|
| 250 |
+
max_iter: '5000'
|
| 251 |
+
offload_module_in_decoding: 'False'
|
| 252 |
+
resume: 'False'
|
| 253 |
+
save_ckpt_iter: '100'
|
| 254 |
+
seed: '0'
|
| 255 |
+
skip_initial_validation: 'True'
|
| 256 |
+
tf32_enabled: 'True'
|
| 257 |
+
val_seed: null
|
| 258 |
+
validation_iter: '100'
|
| 259 |
+
visualize_teacher: 'False'
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-03T00:01:32.430639522+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
|
| 2 |
+
{"time":"2026-04-03T00:01:32.800089951+09:00","level":"INFO","msg":"stream: created new stream","id":"nkf4iovm"}
|
| 3 |
+
{"time":"2026-04-03T00:01:32.800139938+09:00","level":"INFO","msg":"handler: started","stream_id":"nkf4iovm"}
|
| 4 |
+
{"time":"2026-04-03T00:01:32.800233729+09:00","level":"INFO","msg":"stream: started","id":"nkf4iovm"}
|
| 5 |
+
{"time":"2026-04-03T00:01:32.80025365+09:00","level":"INFO","msg":"sender: started","stream_id":"nkf4iovm"}
|
| 6 |
+
{"time":"2026-04-03T00:01:32.800252986+09:00","level":"INFO","msg":"writer: started","stream_id":"nkf4iovm"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
|
| 2 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Configure stats pid to 1047116
|
| 3 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log
|
| 5 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log
|
| 6 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_lr2e6_5000iter_shift5_combined', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7f328019bce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
|
| 9 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-04-03 00:01:32,416 INFO MainThread:1047116 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-04-03 00:01:32,428 INFO MainThread:1047116 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-04-03 00:01:32,432 INFO MainThread:1047116 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-04-03 00:01:32,448 INFO MainThread:1047116 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-03 00:01:33,668 INFO MainThread:1047116 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_console_start():2524] atexit reg
|
| 16 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2373] redirect: wrap_raw
|
| 17 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2442] Wrapping output streams.
|
| 18 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2465] Redirects installed.
|
| 19 |
+
2026-04-03 00:01:33,986 INFO MainThread:1047116 [wandb_init.py:init():1082] run started, returning control to user process
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/output.log
ADDED
|
@@ -0,0 +1,739 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Apr 3, 2026 - 00:01:33 | INFO | fastgen.callbacks.wandb:init_wandb:259 ] Wandb code upload disabled (set WANDB_UPLOAD_CODE=true to enable)
|
| 2 |
+
[Apr 3, 2026 - 00:01:33 | SUCCESS | fastgen.trainer:__init__:53 ] Callbacks initialized successfully
|
| 3 |
+
[Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:__init__:57 ] Callback synchronization complete
|
| 4 |
+
[Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:__init__:60 ] Initializing checkpointer...
|
| 5 |
+
[Apr 3, 2026 - 00:01:33 | SUCCESS | fastgen.trainer:__init__:65 ] Checkpointer initialized successfully
|
| 6 |
+
[Apr 3, 2026 - 00:01:33 | SUCCESS | __main__:main:33 ] Trainer initialized successfully
|
| 7 |
+
[Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:run:77 ] Starting training
|
| 8 |
+
[Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:run:80 ] Initializing callbacks and model ...
|
| 9 |
+
[Apr 3, 2026 - 00:01:33 | INFO | fastgen.utils.checkpointer:load:151 ] Loading model from /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 10 |
+
[Apr 3, 2026 - 00:01:38 | INFO | fastgen.utils.checkpointer:load:154 ] Loading the model_dict...
|
| 11 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.checkpointer:load:159 ] Model net, loading info: <All keys matched successfully>
|
| 12 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:load_pretrained_ckpt:252 ] Loaded net model from net in /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth at iteration 5000
|
| 13 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:load_pretrained_ckpt:262 ] Setting resume_iter for model to 5000.
|
| 14 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:run:95 ] Starting model.on_train_begin ...
|
| 15 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.methods.model:on_train_begin:296 ] Teacher check: add_teacher_to_fsdp_dict=True, fsdp_dict keys=['net', 'fake_score', 'teacher'], teacher in fsdp_dict=True
|
| 16 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:run:99 ] model.on_train_begin completed
|
| 17 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:run:110 ] Wrapping model into fsdp ..
|
| 18 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:102 ] Fully sharding model with 4 ranks...
|
| 19 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'net' (1.42B params)...
|
| 20 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.networks.OmniAvatar.network_causal:fully_shard:1950 ] CausalOmniAvatarWan: keeping manual gradient checkpointing (not using apply_fsdp_checkpointing due to KV cache dynamics)
|
| 21 |
+
[Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
|
| 22 |
+
[Apr 3, 2026 - 00:01:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped net in 1.1s
|
| 23 |
+
[Apr 3, 2026 - 00:01:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'fake_score' (1.60B params)...
|
| 24 |
+
[Apr 3, 2026 - 00:01:51 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
|
| 25 |
+
[Apr 3, 2026 - 00:01:52 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
|
| 26 |
+
[Apr 3, 2026 - 00:01:52 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped fake_score in 1.5s
|
| 27 |
+
[Apr 3, 2026 - 00:01:52 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'teacher' (14.29B params)...
|
| 28 |
+
[Apr 3, 2026 - 00:01:52 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
|
| 29 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
|
| 30 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped teacher in 10.9s
|
| 31 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.trainer:run:118 ] FSDP wrapping completed
|
| 32 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.callbacks.ema:on_model_init_end:64 ] EMA ema is not enabled, skipping callback.
|
| 33 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.trainer:run:133 ] Auto-Resume Details: None
|
| 34 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.utils.basic_utils:set_random_seed:144 ] Using random seed 0.
|
| 35 |
+
[Apr 3, 2026 - 00:02:03 | INFO | fastgen.trainer:run:165 ] Instantiating dataloader...
|
| 36 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.wandb:on_dataloader_init_end:361 ] SKIP_GT_VAL_UPLOAD=1 — skipping GT val video upload
|
| 37 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.gpu_stats:on_train_begin:57 ] every_n to measure gpus stats: 1
|
| 38 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.train_profiler:on_train_begin:54 ] every_n to profile trainer: 1
|
| 39 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] model (OmniAvatarSelfForcingModel) has 1596.36 M trainable and 17311.83 M total params (logical).
|
| 40 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.42 M trainable and 4350.43 M total params LOCAL on rank 0.
|
| 41 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 1.
|
| 42 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 2.
|
| 43 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 3.
|
| 44 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] net (CausalOmniAvatarWan) has 1421.38 M trainable and 1421.38 M total params (logical).
|
| 45 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.68 M trainable and 376.68 M total params LOCAL on rank 0.
|
| 46 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 1.
|
| 47 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 2.
|
| 48 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 3.
|
| 49 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] fake_score (OmniAvatarWan) has 174.98 M trainable and 1596.36 M total params (logical).
|
| 50 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.30 M total params LOCAL on rank 0.
|
| 51 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 1.
|
| 52 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 2.
|
| 53 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 3.
|
| 54 |
+
[Apr 3, 2026 - 00:02:09 | INFO | fastgen.trainer:run:174 ] iter_start: 0
|
| 55 |
+
[MEM] fake_score_update: START: alloc=9.45GB reserved=9.88GB peak=9.60GB
|
| 56 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=12.28GB reserved=49.39GB peak=45.74GB
|
| 57 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=13.16GB peak=45.74GB
|
| 58 |
+
[MEM-fwd] after block 0: alloc=13.74GB peak=45.74GB
|
| 59 |
+
[MEM-fwd] after block 10: alloc=19.55GB peak=45.74GB
|
| 60 |
+
[MEM-fwd] after block 20: alloc=24.84GB peak=45.74GB
|
| 61 |
+
[MEM-fwd] after block 29: alloc=29.59GB peak=45.74GB
|
| 62 |
+
[MEM-fwd] after head+unpatchify: alloc=30.67GB peak=45.74GB
|
| 63 |
+
[MEM] fake_score_update: START: alloc=13.25GB reserved=15.24GB peak=54.53GB
|
| 64 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.29GB reserved=47.38GB peak=46.75GB
|
| 65 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.17GB peak=46.75GB
|
| 66 |
+
[MEM-fwd] after block 0: alloc=14.75GB peak=46.75GB
|
| 67 |
+
[MEM-fwd] after block 10: alloc=20.56GB peak=46.75GB
|
| 68 |
+
[MEM-fwd] after block 20: alloc=25.85GB peak=46.75GB
|
| 69 |
+
[MEM-fwd] after block 29: alloc=30.61GB peak=46.75GB
|
| 70 |
+
[MEM-fwd] after head+unpatchify: alloc=31.69GB peak=46.75GB
|
| 71 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 72 |
+
Avg Max Min
|
| 73 |
+
cpu_mem_gb 38.485269 38.565594 38.411797
|
| 74 |
+
peak_gpu_mem_gb 51.700073 51.700073 51.700073
|
| 75 |
+
peak_gpu_mem_reserved_gb 53.640625 53.640625 53.640625
|
| 76 |
+
util 89.250000 96.000000 84.000000
|
| 77 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 1 : data loading time 0.81
|
| 78 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 1 : avg forward pass time 15.10
|
| 79 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 1 : backward pass time 11.36
|
| 80 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 1 : optimizer step time 1.27
|
| 81 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 1--------------------
|
| 82 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0068 iter count: 1.0
|
| 83 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0068 iter count: 1.0
|
| 84 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 85 |
+
[Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 86 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=57.60GB peak=55.51GB
|
| 87 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.47GB peak=46.93GB
|
| 88 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 89 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 90 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 91 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 92 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 93 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 94 |
+
[MEM] fake_score_update: START: alloc=14.21GB reserved=58.37GB peak=55.69GB
|
| 95 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.25GB peak=47.71GB
|
| 96 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.71GB
|
| 97 |
+
[MEM-fwd] after block 0: alloc=15.71GB peak=47.71GB
|
| 98 |
+
[MEM-fwd] after block 10: alloc=21.52GB peak=47.71GB
|
| 99 |
+
[MEM-fwd] after block 20: alloc=26.81GB peak=47.71GB
|
| 100 |
+
[MEM-fwd] after block 29: alloc=31.57GB peak=47.71GB
|
| 101 |
+
[MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.71GB
|
| 102 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 103 |
+
Avg Max Min
|
| 104 |
+
cpu_mem_gb 38.658718 38.739353 38.585140
|
| 105 |
+
peak_gpu_mem_gb 52.593685 52.593685 52.593685
|
| 106 |
+
peak_gpu_mem_reserved_gb 54.365234 54.365234 54.365234
|
| 107 |
+
util 95.750000 100.000000 92.000000
|
| 108 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 2 : avg iteration time 51.59 seconds
|
| 109 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 2 : data loading time 0.00
|
| 110 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 2 : avg forward pass time 13.82
|
| 111 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 2 : backward pass time 11.41
|
| 112 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 2 : optimizer step time 1.17
|
| 113 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 2--------------------
|
| 114 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0070 iter count: 1.0
|
| 115 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0070 iter count: 1.0
|
| 116 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 117 |
+
[Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 118 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=58.37GB peak=56.47GB
|
| 119 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
|
| 120 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 121 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 122 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 123 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 124 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 125 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 126 |
+
[MEM] fake_score_update: START: alloc=14.21GB reserved=58.40GB peak=55.69GB
|
| 127 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.29GB peak=47.72GB
|
| 128 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
|
| 129 |
+
[MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
|
| 130 |
+
[MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
|
| 131 |
+
[MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
|
| 132 |
+
[MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
|
| 133 |
+
[MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
|
| 134 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 135 |
+
Avg Max Min
|
| 136 |
+
cpu_mem_gb 38.659063 38.740227 38.585308
|
| 137 |
+
peak_gpu_mem_gb 52.593685 52.593685 52.593685
|
| 138 |
+
peak_gpu_mem_reserved_gb 54.404297 54.404297 54.404297
|
| 139 |
+
util 96.750000 100.000000 92.000000
|
| 140 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 3 : avg iteration time 58.07 seconds
|
| 141 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 3 : data loading time 0.00
|
| 142 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 3 : avg forward pass time 17.04
|
| 143 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 3 : backward pass time 11.38
|
| 144 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 3 : optimizer step time 1.18
|
| 145 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 3--------------------
|
| 146 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0072 iter count: 1.0
|
| 147 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0072 iter count: 1.0
|
| 148 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 149 |
+
[Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 150 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=58.42GB peak=56.47GB
|
| 151 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.52GB peak=46.93GB
|
| 152 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 153 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 154 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 155 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 156 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 157 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 158 |
+
[MEM] fake_score_update: START: alloc=14.21GB reserved=58.42GB peak=55.69GB
|
| 159 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.27GB peak=47.72GB
|
| 160 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
|
| 161 |
+
[MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
|
| 162 |
+
[MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
|
| 163 |
+
[MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
|
| 164 |
+
[MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
|
| 165 |
+
[MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
|
| 166 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 167 |
+
Avg Max Min
|
| 168 |
+
cpu_mem_gb 38.659183 38.740608 38.585339
|
| 169 |
+
peak_gpu_mem_gb 52.593685 52.593685 52.593685
|
| 170 |
+
peak_gpu_mem_reserved_gb 54.443359 54.443359 54.443359
|
| 171 |
+
util 95.250000 100.000000 91.000000
|
| 172 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 4 : avg iteration time 57.93 seconds
|
| 173 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 4 : data loading time 0.00
|
| 174 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 4 : avg forward pass time 17.00
|
| 175 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 4 : backward pass time 11.36
|
| 176 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 4 : optimizer step time 1.18
|
| 177 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 4--------------------
|
| 178 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0073 iter count: 1.0
|
| 179 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0073 iter count: 1.0
|
| 180 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 181 |
+
[Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 182 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=58.46GB peak=56.47GB
|
| 183 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
|
| 184 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 185 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 186 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 187 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 188 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 189 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 190 |
+
[MEM] student_update: START: alloc=14.32GB reserved=58.46GB peak=55.75GB
|
| 191 |
+
[MEM] student_update: after rollout: alloc=63.85GB reserved=66.24GB peak=65.62GB
|
| 192 |
+
[MEM] student_update: after perturb: alloc=63.87GB reserved=66.24GB peak=65.62GB
|
| 193 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=64.62GB peak=65.62GB
|
| 194 |
+
[MEM-fwd] after block 0: alloc=64.67GB peak=75.08GB
|
| 195 |
+
[MEM-fwd] after block 10: alloc=65.20GB peak=75.61GB
|
| 196 |
+
[MEM-fwd] after block 20: alloc=65.20GB peak=75.61GB
|
| 197 |
+
[MEM-fwd] after block 29: alloc=65.20GB peak=75.61GB
|
| 198 |
+
[MEM-fwd] after head+unpatchify: alloc=64.69GB peak=75.61GB
|
| 199 |
+
[MEM] student_update: after fake_score: alloc=63.95GB reserved=78.02GB peak=75.61GB
|
| 200 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.83GB peak=75.61GB
|
| 201 |
+
[MEM-fwd] after block 0: alloc=67.05GB peak=88.90GB
|
| 202 |
+
[MEM-fwd] after block 10: alloc=68.82GB peak=90.66GB
|
| 203 |
+
[MEM-fwd] after block 20: alloc=68.82GB peak=90.66GB
|
| 204 |
+
[MEM-fwd] after block 30: alloc=68.82GB peak=90.66GB
|
| 205 |
+
[MEM-fwd] after block 39: alloc=68.82GB peak=90.66GB
|
| 206 |
+
[MEM-fwd] after head+unpatchify: alloc=67.08GB peak=90.66GB
|
| 207 |
+
[MEM] student_update: after teacher: alloc=64.45GB reserved=93.79GB peak=90.66GB
|
| 208 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.37GB peak=90.66GB
|
| 209 |
+
[MEM-fwd] after block 0: alloc=67.08GB peak=90.66GB
|
| 210 |
+
[MEM-fwd] after block 10: alloc=68.84GB peak=90.68GB
|
| 211 |
+
[MEM-fwd] after block 20: alloc=68.84GB peak=90.68GB
|
| 212 |
+
[MEM-fwd] after block 30: alloc=68.84GB peak=90.68GB
|
| 213 |
+
[MEM-fwd] after block 39: alloc=68.84GB peak=90.68GB
|
| 214 |
+
[MEM-fwd] after head+unpatchify: alloc=67.10GB peak=90.68GB
|
| 215 |
+
[MEM] student_update: after CFG: alloc=64.45GB reserved=93.83GB peak=90.68GB
|
| 216 |
+
[MEM] student_update: after VSD loss: alloc=64.47GB reserved=93.83GB peak=90.68GB
|
| 217 |
+
[MEM] fake_score_update: START: alloc=12.68GB reserved=72.00GB peak=90.68GB
|
| 218 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.51GB reserved=52.62GB peak=48.98GB
|
| 219 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.28GB peak=48.98GB
|
| 220 |
+
[MEM-fwd] after block 0: alloc=16.91GB peak=48.98GB
|
| 221 |
+
[MEM-fwd] after block 10: alloc=22.73GB peak=48.98GB
|
| 222 |
+
[MEM-fwd] after block 20: alloc=28.01GB peak=48.98GB
|
| 223 |
+
[MEM-fwd] after block 29: alloc=32.77GB peak=48.98GB
|
| 224 |
+
[MEM-fwd] after head+unpatchify: alloc=33.85GB peak=48.98GB
|
| 225 |
+
[MEM] student_update: START: alloc=15.52GB reserved=61.80GB peak=56.95GB
|
| 226 |
+
[MEM] student_update: after rollout: alloc=65.05GB reserved=68.04GB peak=66.83GB
|
| 227 |
+
[MEM] student_update: after perturb: alloc=65.07GB reserved=68.04GB peak=66.83GB
|
| 228 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=65.82GB peak=66.83GB
|
| 229 |
+
[MEM-fwd] after block 0: alloc=65.87GB peak=76.28GB
|
| 230 |
+
[MEM-fwd] after block 10: alloc=66.40GB peak=76.81GB
|
| 231 |
+
[MEM-fwd] after block 20: alloc=66.40GB peak=76.81GB
|
| 232 |
+
[MEM-fwd] after block 29: alloc=66.40GB peak=76.81GB
|
| 233 |
+
[MEM-fwd] after head+unpatchify: alloc=65.90GB peak=76.81GB
|
| 234 |
+
[MEM] student_update: after fake_score: alloc=65.15GB reserved=79.81GB peak=76.81GB
|
| 235 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.08GB peak=76.81GB
|
| 236 |
+
[MEM-fwd] after block 0: alloc=67.78GB peak=89.62GB
|
| 237 |
+
[MEM-fwd] after block 10: alloc=69.54GB peak=91.39GB
|
| 238 |
+
[MEM-fwd] after block 20: alloc=69.54GB peak=91.39GB
|
| 239 |
+
[MEM-fwd] after block 30: alloc=69.54GB peak=91.39GB
|
| 240 |
+
[MEM-fwd] after block 39: alloc=69.54GB peak=91.39GB
|
| 241 |
+
[MEM-fwd] after head+unpatchify: alloc=67.80GB peak=91.39GB
|
| 242 |
+
[MEM] student_update: after teacher: alloc=65.17GB reserved=94.89GB peak=91.39GB
|
| 243 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.10GB peak=91.39GB
|
| 244 |
+
[MEM-fwd] after block 0: alloc=67.80GB peak=91.39GB
|
| 245 |
+
[MEM-fwd] after block 10: alloc=69.57GB peak=91.41GB
|
| 246 |
+
[MEM-fwd] after block 20: alloc=69.57GB peak=91.41GB
|
| 247 |
+
[MEM-fwd] after block 30: alloc=69.57GB peak=91.41GB
|
| 248 |
+
[MEM-fwd] after block 39: alloc=69.57GB peak=91.41GB
|
| 249 |
+
[MEM-fwd] after head+unpatchify: alloc=67.83GB peak=91.41GB
|
| 250 |
+
[MEM] student_update: after CFG: alloc=65.17GB reserved=94.91GB peak=91.41GB
|
| 251 |
+
[MEM] student_update: after VSD loss: alloc=65.20GB reserved=94.91GB peak=91.41GB
|
| 252 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 253 |
+
Avg Max Min
|
| 254 |
+
cpu_mem_gb 38.678225 38.757980 38.605282
|
| 255 |
+
peak_gpu_mem_gb 85.130531 85.130531 85.130531
|
| 256 |
+
peak_gpu_mem_reserved_gb 90.766602 90.771484 90.751953
|
| 257 |
+
util 67.500000 72.000000 61.000000
|
| 258 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 5 : avg iteration time 193.31 seconds
|
| 259 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 5 : data loading time 0.00
|
| 260 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 5 : avg forward pass time 86.26
|
| 261 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 5 : backward pass time 10.03
|
| 262 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 5 : optimizer step time 0.31
|
| 263 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 5--------------------
|
| 264 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.6670 iter count: 1.0
|
| 265 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_vsd_loss: 0.6670 iter count: 1.0
|
| 266 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_gen: 0.0000 iter count: 1.0
|
| 267 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0075 iter count: 1.0
|
| 268 |
+
[Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 269 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=97.44GB peak=91.41GB
|
| 270 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.26GB reserved=53.38GB peak=49.73GB
|
| 271 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.03GB peak=49.73GB
|
| 272 |
+
[MEM-fwd] after block 0: alloc=17.66GB peak=49.73GB
|
| 273 |
+
[MEM-fwd] after block 10: alloc=23.48GB peak=49.73GB
|
| 274 |
+
[MEM-fwd] after block 20: alloc=28.76GB peak=49.73GB
|
| 275 |
+
[MEM-fwd] after block 29: alloc=33.52GB peak=49.73GB
|
| 276 |
+
[MEM-fwd] after head+unpatchify: alloc=34.60GB peak=49.73GB
|
| 277 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=62.77GB peak=58.43GB
|
| 278 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.28GB peak=49.69GB
|
| 279 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 280 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 281 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 282 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 283 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 284 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 285 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 286 |
+
Avg Max Min
|
| 287 |
+
cpu_mem_gb 38.678251 38.758018 38.605293
|
| 288 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 289 |
+
peak_gpu_mem_reserved_gb 57.687500 58.478516 55.314453
|
| 290 |
+
util 98.500000 100.000000 96.000000
|
| 291 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 6 : avg iteration time 52.27 seconds
|
| 292 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 6 : data loading time 0.00
|
| 293 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 6 : avg forward pass time 14.22
|
| 294 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 6 : backward pass time 11.33
|
| 295 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 6 : optimizer step time 1.17
|
| 296 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 6--------------------
|
| 297 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0073 iter count: 1.0
|
| 298 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0073 iter count: 1.0
|
| 299 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 300 |
+
[Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 301 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.39GB peak=58.45GB
|
| 302 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.46GB peak=48.91GB
|
| 303 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 304 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 305 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 306 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 307 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 308 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 309 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.79GB peak=57.66GB
|
| 310 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.28GB peak=49.69GB
|
| 311 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 312 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 313 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 314 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 315 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 316 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 317 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 318 |
+
Avg Max Min
|
| 319 |
+
cpu_mem_gb 38.678332 38.758266 38.605301
|
| 320 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 321 |
+
peak_gpu_mem_reserved_gb 57.311523 57.853516 55.685547
|
| 322 |
+
util 97.250000 100.000000 92.000000
|
| 323 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 7 : avg iteration time 57.95 seconds
|
| 324 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 7 : data loading time 0.00
|
| 325 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 7 : avg forward pass time 17.05
|
| 326 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 7 : backward pass time 11.35
|
| 327 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 7 : optimizer step time 1.17
|
| 328 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 7--------------------
|
| 329 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0071 iter count: 1.0
|
| 330 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0071 iter count: 1.0
|
| 331 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 332 |
+
[Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 333 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.79GB peak=58.45GB
|
| 334 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.50GB peak=48.91GB
|
| 335 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 336 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 337 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 338 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 339 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 340 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 341 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.71GB peak=57.66GB
|
| 342 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.28GB peak=49.69GB
|
| 343 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 344 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 345 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 346 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 347 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 348 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 349 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 350 |
+
Avg Max Min
|
| 351 |
+
cpu_mem_gb 38.678370 38.758301 38.605301
|
| 352 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 353 |
+
peak_gpu_mem_reserved_gb 55.509766 55.685547 55.451172
|
| 354 |
+
util 97.500000 100.000000 95.000000
|
| 355 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 8 : avg iteration time 57.73 seconds
|
| 356 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 8 : data loading time 0.00
|
| 357 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 8 : avg forward pass time 16.95
|
| 358 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 8 : backward pass time 11.31
|
| 359 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 8 : optimizer step time 1.18
|
| 360 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 8--------------------
|
| 361 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0067 iter count: 1.0
|
| 362 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0067 iter count: 1.0
|
| 363 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 364 |
+
[Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 365 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.79GB peak=58.45GB
|
| 366 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.48GB peak=48.91GB
|
| 367 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 368 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 369 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 370 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 371 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 372 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 373 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.81GB peak=57.66GB
|
| 374 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.30GB peak=49.69GB
|
| 375 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 376 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 377 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 378 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 379 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 380 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 381 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 382 |
+
Avg Max Min
|
| 383 |
+
cpu_mem_gb 38.678534 38.758778 38.605354
|
| 384 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 385 |
+
peak_gpu_mem_reserved_gb 55.543945 55.705078 55.490234
|
| 386 |
+
util 96.750000 99.000000 94.000000
|
| 387 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 9 : avg iteration time 61.18 seconds
|
| 388 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 9 : data loading time 0.00
|
| 389 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 9 : avg forward pass time 18.67
|
| 390 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 9 : backward pass time 11.33
|
| 391 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 9 : optimizer step time 1.18
|
| 392 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 9--------------------
|
| 393 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0071 iter count: 1.0
|
| 394 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0071 iter count: 1.0
|
| 395 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 396 |
+
[Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 397 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.81GB peak=58.45GB
|
| 398 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.50GB peak=48.91GB
|
| 399 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 400 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 401 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 402 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 403 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 404 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 405 |
+
[MEM] student_update: START: alloc=16.30GB reserved=59.83GB peak=57.72GB
|
| 406 |
+
[MEM] student_update: after rollout: alloc=65.82GB reserved=68.23GB peak=67.60GB
|
| 407 |
+
[MEM] student_update: after perturb: alloc=65.85GB reserved=68.23GB peak=67.60GB
|
| 408 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=66.60GB peak=67.60GB
|
| 409 |
+
[MEM-fwd] after block 0: alloc=66.64GB peak=77.05GB
|
| 410 |
+
[MEM-fwd] after block 10: alloc=67.17GB peak=77.58GB
|
| 411 |
+
[MEM-fwd] after block 20: alloc=67.17GB peak=77.58GB
|
| 412 |
+
[MEM-fwd] after block 29: alloc=67.17GB peak=77.58GB
|
| 413 |
+
[MEM-fwd] after head+unpatchify: alloc=66.67GB peak=77.58GB
|
| 414 |
+
[MEM] student_update: after fake_score: alloc=65.92GB reserved=79.54GB peak=77.58GB
|
| 415 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.85GB peak=77.58GB
|
| 416 |
+
[MEM-fwd] after block 0: alloc=68.55GB peak=90.40GB
|
| 417 |
+
[MEM-fwd] after block 10: alloc=70.31GB peak=92.16GB
|
| 418 |
+
[MEM-fwd] after block 20: alloc=70.31GB peak=92.16GB
|
| 419 |
+
[MEM-fwd] after block 30: alloc=70.31GB peak=92.16GB
|
| 420 |
+
[MEM-fwd] after block 39: alloc=70.31GB peak=92.16GB
|
| 421 |
+
[MEM-fwd] after head+unpatchify: alloc=68.58GB peak=92.16GB
|
| 422 |
+
[MEM] student_update: after teacher: alloc=65.95GB reserved=95.14GB peak=92.16GB
|
| 423 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.87GB peak=92.16GB
|
| 424 |
+
[MEM-fwd] after block 0: alloc=68.58GB peak=92.16GB
|
| 425 |
+
[MEM-fwd] after block 10: alloc=70.34GB peak=92.18GB
|
| 426 |
+
[MEM-fwd] after block 20: alloc=70.34GB peak=92.18GB
|
| 427 |
+
[MEM-fwd] after block 30: alloc=70.34GB peak=92.18GB
|
| 428 |
+
[MEM-fwd] after block 39: alloc=70.34GB peak=92.18GB
|
| 429 |
+
[MEM-fwd] after head+unpatchify: alloc=68.60GB peak=92.18GB
|
| 430 |
+
[MEM] student_update: after CFG: alloc=65.95GB reserved=95.16GB peak=92.18GB
|
| 431 |
+
[MEM] student_update: after VSD loss: alloc=65.97GB reserved=95.16GB peak=92.18GB
|
| 432 |
+
[MEM] fake_score_update: START: alloc=14.18GB reserved=98.71GB peak=92.18GB
|
| 433 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=17.01GB reserved=54.14GB peak=50.47GB
|
| 434 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.78GB peak=50.47GB
|
| 435 |
+
[MEM-fwd] after block 0: alloc=18.41GB peak=50.47GB
|
| 436 |
+
[MEM-fwd] after block 10: alloc=24.23GB peak=50.47GB
|
| 437 |
+
[MEM-fwd] after block 20: alloc=29.51GB peak=50.47GB
|
| 438 |
+
[MEM-fwd] after block 29: alloc=34.27GB peak=50.47GB
|
| 439 |
+
[MEM-fwd] after head+unpatchify: alloc=35.35GB peak=50.47GB
|
| 440 |
+
[MEM] student_update: START: alloc=17.02GB reserved=66.65GB peak=58.45GB
|
| 441 |
+
[MEM] student_update: after rollout: alloc=66.55GB reserved=69.49GB peak=68.33GB
|
| 442 |
+
[MEM] student_update: after perturb: alloc=66.57GB reserved=69.49GB peak=68.33GB
|
| 443 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=67.32GB peak=68.33GB
|
| 444 |
+
[MEM-fwd] after block 0: alloc=67.37GB peak=77.78GB
|
| 445 |
+
[MEM-fwd] after block 10: alloc=67.90GB peak=78.31GB
|
| 446 |
+
[MEM-fwd] after block 20: alloc=67.90GB peak=78.31GB
|
| 447 |
+
[MEM-fwd] after block 29: alloc=67.90GB peak=78.31GB
|
| 448 |
+
[MEM-fwd] after head+unpatchify: alloc=67.39GB peak=78.31GB
|
| 449 |
+
[MEM] student_update: after fake_score: alloc=66.65GB reserved=81.28GB peak=78.31GB
|
| 450 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.58GB peak=78.31GB
|
| 451 |
+
[MEM-fwd] after block 0: alloc=69.28GB peak=91.12GB
|
| 452 |
+
[MEM-fwd] after block 10: alloc=71.04GB peak=92.88GB
|
| 453 |
+
[MEM-fwd] after block 20: alloc=71.04GB peak=92.88GB
|
| 454 |
+
[MEM-fwd] after block 30: alloc=71.04GB peak=92.88GB
|
| 455 |
+
[MEM-fwd] after block 39: alloc=71.04GB peak=92.88GB
|
| 456 |
+
[MEM-fwd] after head+unpatchify: alloc=69.30GB peak=92.88GB
|
| 457 |
+
[MEM] student_update: after teacher: alloc=66.67GB reserved=96.36GB peak=92.88GB
|
| 458 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.60GB peak=92.88GB
|
| 459 |
+
[MEM-fwd] after block 0: alloc=69.30GB peak=92.88GB
|
| 460 |
+
[MEM-fwd] after block 10: alloc=71.06GB peak=92.91GB
|
| 461 |
+
[MEM-fwd] after block 20: alloc=71.06GB peak=92.91GB
|
| 462 |
+
[MEM-fwd] after block 30: alloc=71.06GB peak=92.91GB
|
| 463 |
+
[MEM-fwd] after block 39: alloc=71.06GB peak=92.91GB
|
| 464 |
+
[MEM-fwd] after head+unpatchify: alloc=69.32GB peak=92.91GB
|
| 465 |
+
[MEM] student_update: after CFG: alloc=66.67GB reserved=96.36GB peak=92.91GB
|
| 466 |
+
[MEM] student_update: after VSD loss: alloc=66.70GB reserved=96.36GB peak=92.91GB
|
| 467 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 468 |
+
Avg Max Min
|
| 469 |
+
cpu_mem_gb 38.679560 38.760101 38.606266
|
| 470 |
+
peak_gpu_mem_gb 86.525435 86.526208 86.525178
|
| 471 |
+
peak_gpu_mem_reserved_gb 92.133301 92.142578 92.105469
|
| 472 |
+
util 78.750000 83.000000 73.000000
|
| 473 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 10 : avg iteration time 185.69 seconds
|
| 474 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 10 : data loading time 0.00
|
| 475 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 10 : avg forward pass time 82.75
|
| 476 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 10 : backward pass time 10.03
|
| 477 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 10 : optimizer step time 0.21
|
| 478 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 10--------------------
|
| 479 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.7031 iter count: 1.0
|
| 480 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_vsd_loss: 0.7031 iter count: 1.0
|
| 481 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_gen: 0.0000 iter count: 1.0
|
| 482 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0071 iter count: 1.0
|
| 483 |
+
[Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 484 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=98.90GB peak=92.91GB
|
| 485 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.26GB reserved=53.40GB peak=49.73GB
|
| 486 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.03GB peak=49.73GB
|
| 487 |
+
[MEM-fwd] after block 0: alloc=17.66GB peak=49.73GB
|
| 488 |
+
[MEM-fwd] after block 10: alloc=23.48GB peak=49.73GB
|
| 489 |
+
[MEM-fwd] after block 20: alloc=28.76GB peak=49.73GB
|
| 490 |
+
[MEM-fwd] after block 29: alloc=33.52GB peak=49.73GB
|
| 491 |
+
[MEM-fwd] after head+unpatchify: alloc=34.60GB peak=49.73GB
|
| 492 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=62.56GB peak=58.43GB
|
| 493 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.36GB peak=49.69GB
|
| 494 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 495 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 496 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 497 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 498 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 499 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 500 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 501 |
+
Avg Max Min
|
| 502 |
+
cpu_mem_gb 38.679598 38.760120 38.606396
|
| 503 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 504 |
+
peak_gpu_mem_reserved_gb 55.583008 55.802734 55.509766
|
| 505 |
+
util 97.250000 100.000000 91.000000
|
| 506 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 11 : avg iteration time 58.86 seconds
|
| 507 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 11 : data loading time 0.00
|
| 508 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 11 : avg forward pass time 17.51
|
| 509 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 11 : backward pass time 11.32
|
| 510 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 11 : optimizer step time 1.20
|
| 511 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 11--------------------
|
| 512 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0087 iter count: 1.0
|
| 513 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0087 iter count: 1.0
|
| 514 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 515 |
+
[Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 516 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.92GB peak=58.45GB
|
| 517 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.52GB peak=48.91GB
|
| 518 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 519 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 520 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 521 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 522 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 523 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 524 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.44GB peak=57.66GB
|
| 525 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.30GB peak=49.69GB
|
| 526 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 527 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 528 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 529 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 530 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 531 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 532 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 533 |
+
Avg Max Min
|
| 534 |
+
cpu_mem_gb 38.679661 38.760372 38.606403
|
| 535 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 536 |
+
peak_gpu_mem_reserved_gb 55.514648 55.568359 55.353516
|
| 537 |
+
util 97.250000 100.000000 93.000000
|
| 538 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 12 : avg iteration time 54.55 seconds
|
| 539 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 12 : data loading time 0.00
|
| 540 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 12 : avg forward pass time 15.35
|
| 541 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 12 : backward pass time 11.34
|
| 542 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 12 : optimizer step time 1.17
|
| 543 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 12--------------------
|
| 544 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0070 iter count: 1.0
|
| 545 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0070 iter count: 1.0
|
| 546 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 547 |
+
[Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 548 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.44GB peak=58.45GB
|
| 549 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.52GB peak=48.90GB
|
| 550 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.90GB
|
| 551 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.90GB
|
| 552 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.90GB
|
| 553 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.90GB
|
| 554 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.90GB
|
| 555 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.90GB
|
| 556 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.46GB peak=57.66GB
|
| 557 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.02GB peak=49.69GB
|
| 558 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 559 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 560 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 561 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 562 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 563 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 564 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 565 |
+
Avg Max Min
|
| 566 |
+
cpu_mem_gb 38.679665 38.760372 38.606419
|
| 567 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 568 |
+
peak_gpu_mem_reserved_gb 57.541016 58.224609 55.490234
|
| 569 |
+
util 97.750000 99.000000 94.000000
|
| 570 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 13 : avg iteration time 48.16 seconds
|
| 571 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 13 : data loading time 0.00
|
| 572 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 13 : avg forward pass time 12.17
|
| 573 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 13 : backward pass time 11.32
|
| 574 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 13 : optimizer step time 1.17
|
| 575 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 13--------------------
|
| 576 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0072 iter count: 1.0
|
| 577 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0072 iter count: 1.0
|
| 578 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 579 |
+
[Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 580 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.58GB peak=58.45GB
|
| 581 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.54GB peak=48.91GB
|
| 582 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 583 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 584 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 585 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 586 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 587 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 588 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.44GB peak=57.66GB
|
| 589 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.32GB peak=49.69GB
|
| 590 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
|
| 591 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
|
| 592 |
+
[MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
|
| 593 |
+
[MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
|
| 594 |
+
[MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
|
| 595 |
+
[MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
|
| 596 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 597 |
+
Avg Max Min
|
| 598 |
+
cpu_mem_gb 38.679850 38.761127 38.606380
|
| 599 |
+
peak_gpu_mem_gb 54.432534 54.433307 54.432277
|
| 600 |
+
peak_gpu_mem_reserved_gb 55.739258 55.744141 55.724609
|
| 601 |
+
util 98.000000 100.000000 94.000000
|
| 602 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 14 : avg iteration time 57.85 seconds
|
| 603 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 14 : data loading time 0.00
|
| 604 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 14 : avg forward pass time 17.00
|
| 605 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 14 : backward pass time 11.34
|
| 606 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 14 : optimizer step time 1.18
|
| 607 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 14--------------------
|
| 608 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0077 iter count: 1.0
|
| 609 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0077 iter count: 1.0
|
| 610 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 611 |
+
[Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 612 |
+
[MEM] fake_score_update: START: alloc=15.40GB reserved=59.83GB peak=58.45GB
|
| 613 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.50GB peak=48.91GB
|
| 614 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
|
| 615 |
+
[MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
|
| 616 |
+
[MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
|
| 617 |
+
[MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
|
| 618 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
|
| 619 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
|
| 620 |
+
[MEM] student_update: START: alloc=16.30GB reserved=59.44GB peak=57.72GB
|
| 621 |
+
[MEM] student_update: after rollout: alloc=65.84GB reserved=68.21GB peak=67.61GB
|
| 622 |
+
[MEM] student_update: after perturb: alloc=65.86GB reserved=68.21GB peak=67.61GB
|
| 623 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=66.61GB peak=67.61GB
|
| 624 |
+
[MEM-fwd] after block 0: alloc=66.66GB peak=77.07GB
|
| 625 |
+
[MEM-fwd] after block 10: alloc=67.19GB peak=77.60GB
|
| 626 |
+
[MEM-fwd] after block 20: alloc=67.19GB peak=77.60GB
|
| 627 |
+
[MEM-fwd] after block 29: alloc=67.19GB peak=77.60GB
|
| 628 |
+
[MEM-fwd] after head+unpatchify: alloc=66.68GB peak=77.60GB
|
| 629 |
+
[MEM] student_update: after fake_score: alloc=65.94GB reserved=80.02GB peak=77.60GB
|
| 630 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.86GB peak=77.60GB
|
| 631 |
+
[MEM-fwd] after block 0: alloc=68.57GB peak=90.41GB
|
| 632 |
+
[MEM-fwd] after block 10: alloc=70.33GB peak=92.17GB
|
| 633 |
+
[MEM-fwd] after block 20: alloc=70.33GB peak=92.17GB
|
| 634 |
+
[MEM-fwd] after block 30: alloc=70.33GB peak=92.17GB
|
| 635 |
+
[MEM-fwd] after block 39: alloc=70.33GB peak=92.17GB
|
| 636 |
+
[MEM-fwd] after head+unpatchify: alloc=68.59GB peak=92.17GB
|
| 637 |
+
[MEM] student_update: after teacher: alloc=65.96GB reserved=95.12GB peak=92.17GB
|
| 638 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.89GB peak=92.17GB
|
| 639 |
+
[MEM-fwd] after block 0: alloc=68.59GB peak=92.17GB
|
| 640 |
+
[MEM-fwd] after block 10: alloc=70.35GB peak=92.19GB
|
| 641 |
+
[MEM-fwd] after block 20: alloc=70.35GB peak=92.19GB
|
| 642 |
+
[MEM-fwd] after block 30: alloc=70.35GB peak=92.19GB
|
| 643 |
+
[MEM-fwd] after block 39: alloc=70.35GB peak=92.19GB
|
| 644 |
+
[MEM-fwd] after head+unpatchify: alloc=68.61GB peak=92.19GB
|
| 645 |
+
[MEM] student_update: after CFG: alloc=65.96GB reserved=95.12GB peak=92.19GB
|
| 646 |
+
[MEM] student_update: after VSD loss: alloc=65.98GB reserved=95.12GB peak=92.19GB
|
| 647 |
+
[MEM] fake_score_update: START: alloc=14.18GB reserved=98.66GB peak=92.19GB
|
| 648 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=17.02GB reserved=54.16GB peak=50.48GB
|
| 649 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.78GB peak=50.48GB
|
| 650 |
+
[MEM-fwd] after block 0: alloc=18.42GB peak=50.48GB
|
| 651 |
+
[MEM-fwd] after block 10: alloc=24.23GB peak=50.48GB
|
| 652 |
+
[MEM-fwd] after block 20: alloc=29.52GB peak=50.48GB
|
| 653 |
+
[MEM-fwd] after block 29: alloc=34.27GB peak=50.48GB
|
| 654 |
+
[MEM-fwd] after head+unpatchify: alloc=35.35GB peak=50.48GB
|
| 655 |
+
[MEM] student_update: START: alloc=17.03GB reserved=63.28GB peak=58.45GB
|
| 656 |
+
[MEM] student_update: after rollout: alloc=66.57GB reserved=69.53GB peak=68.34GB
|
| 657 |
+
[MEM] student_update: after perturb: alloc=66.59GB reserved=69.53GB peak=68.34GB
|
| 658 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=67.34GB peak=68.34GB
|
| 659 |
+
[MEM-fwd] after block 0: alloc=67.39GB peak=77.80GB
|
| 660 |
+
[MEM-fwd] after block 10: alloc=67.92GB peak=78.33GB
|
| 661 |
+
[MEM-fwd] after block 20: alloc=67.92GB peak=78.33GB
|
| 662 |
+
[MEM-fwd] after block 29: alloc=67.92GB peak=78.33GB
|
| 663 |
+
[MEM-fwd] after head+unpatchify: alloc=67.41GB peak=78.33GB
|
| 664 |
+
[MEM] student_update: after fake_score: alloc=66.67GB reserved=80.78GB peak=78.33GB
|
| 665 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.60GB peak=78.33GB
|
| 666 |
+
[MEM-fwd] after block 0: alloc=69.30GB peak=91.14GB
|
| 667 |
+
[MEM-fwd] after block 10: alloc=71.06GB peak=92.90GB
|
| 668 |
+
[MEM-fwd] after block 20: alloc=71.06GB peak=92.90GB
|
| 669 |
+
[MEM-fwd] after block 30: alloc=71.06GB peak=92.90GB
|
| 670 |
+
[MEM-fwd] after block 39: alloc=71.06GB peak=92.90GB
|
| 671 |
+
[MEM-fwd] after head+unpatchify: alloc=69.32GB peak=92.90GB
|
| 672 |
+
[MEM] student_update: after teacher: alloc=66.69GB reserved=96.38GB peak=92.90GB
|
| 673 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.62GB peak=92.90GB
|
| 674 |
+
[MEM-fwd] after block 0: alloc=69.32GB peak=92.90GB
|
| 675 |
+
[MEM-fwd] after block 10: alloc=71.08GB peak=92.93GB
|
| 676 |
+
[MEM-fwd] after block 20: alloc=71.08GB peak=92.93GB
|
| 677 |
+
[MEM-fwd] after block 30: alloc=71.08GB peak=92.93GB
|
| 678 |
+
[MEM-fwd] after block 39: alloc=71.08GB peak=92.93GB
|
| 679 |
+
[MEM-fwd] after head+unpatchify: alloc=69.34GB peak=92.93GB
|
| 680 |
+
[MEM] student_update: after CFG: alloc=66.69GB reserved=96.42GB peak=92.93GB
|
| 681 |
+
[MEM] student_update: after VSD loss: alloc=66.71GB reserved=96.42GB peak=92.93GB
|
| 682 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 683 |
+
Avg Max Min
|
| 684 |
+
cpu_mem_gb 38.679507 38.759026 38.606628
|
| 685 |
+
peak_gpu_mem_gb 86.542802 86.543575 86.542545
|
| 686 |
+
peak_gpu_mem_reserved_gb 92.208496 92.222656 92.166016
|
| 687 |
+
util 77.500000 82.000000 71.000000
|
| 688 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 15 : avg iteration time 182.57 seconds
|
| 689 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 15 : data loading time 0.00
|
| 690 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 15 : avg forward pass time 81.19
|
| 691 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 15 : backward pass time 10.03
|
| 692 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 15 : optimizer step time 0.22
|
| 693 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 15--------------------
|
| 694 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.6475 iter count: 1.0
|
| 695 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_vsd_loss: 0.6475 iter count: 1.0
|
| 696 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_gen: 0.0000 iter count: 1.0
|
| 697 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0067 iter count: 1.0
|
| 698 |
+
[Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 699 |
+
[MEM] fake_score_update: START: alloc=13.44GB reserved=98.96GB peak=92.93GB
|
| 700 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.27GB reserved=53.40GB peak=49.73GB
|
| 701 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.04GB peak=49.73GB
|
| 702 |
+
[MEM-fwd] after block 0: alloc=17.67GB peak=49.73GB
|
| 703 |
+
[MEM-fwd] after block 10: alloc=23.49GB peak=49.73GB
|
| 704 |
+
[MEM-fwd] after block 20: alloc=28.77GB peak=49.73GB
|
| 705 |
+
[MEM-fwd] after block 29: alloc=33.53GB peak=49.73GB
|
| 706 |
+
[MEM-fwd] after head+unpatchify: alloc=34.61GB peak=49.73GB
|
| 707 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=62.56GB peak=58.43GB
|
| 708 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=16.24GB reserved=50.28GB peak=49.70GB
|
| 709 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.12GB peak=49.70GB
|
| 710 |
+
[MEM-fwd] after block 0: alloc=17.69GB peak=49.70GB
|
| 711 |
+
[MEM-fwd] after block 10: alloc=23.51GB peak=49.70GB
|
| 712 |
+
[MEM-fwd] after block 20: alloc=28.79GB peak=49.70GB
|
| 713 |
+
[MEM-fwd] after block 29: alloc=33.55GB peak=49.70GB
|
| 714 |
+
[MEM-fwd] after head+unpatchify: alloc=34.63GB peak=49.70GB
|
| 715 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 716 |
+
Avg Max Min
|
| 717 |
+
cpu_mem_gb 38.679513 38.759029 38.606636
|
| 718 |
+
peak_gpu_mem_gb 54.440308 54.441080 54.440050
|
| 719 |
+
peak_gpu_mem_reserved_gb 57.801758 58.539062 55.589844
|
| 720 |
+
util 98.500000 100.000000 95.000000
|
| 721 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 16 : avg iteration time 58.84 seconds
|
| 722 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 16 : data loading time 0.00
|
| 723 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 16 : avg forward pass time 17.46
|
| 724 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 16 : backward pass time 11.37
|
| 725 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 16 : optimizer step time 1.18
|
| 726 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 16--------------------
|
| 727 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0066 iter count: 1.0
|
| 728 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0066 iter count: 1.0
|
| 729 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 730 |
+
[Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 731 |
+
[MEM] fake_score_update: START: alloc=15.41GB reserved=59.69GB peak=58.46GB
|
| 732 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.45GB reserved=49.55GB peak=48.92GB
|
| 733 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.33GB peak=48.92GB
|
| 734 |
+
[MEM-fwd] after block 0: alloc=16.91GB peak=48.92GB
|
| 735 |
+
[MEM-fwd] after block 10: alloc=22.72GB peak=48.92GB
|
| 736 |
+
[MEM-fwd] after block 20: alloc=28.01GB peak=48.92GB
|
| 737 |
+
[MEM-fwd] after block 29: alloc=32.76GB peak=48.92GB
|
| 738 |
+
[MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.92GB
|
| 739 |
+
[MEM] fake_score_update: START: alloc=16.19GB reserved=59.46GB peak=57.67GB
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/requirements.txt
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastgen==0.1.0
|
| 2 |
+
nvitop==1.6.1
|
| 3 |
+
ftfy==6.3.1
|
| 4 |
+
braceexpand==0.1.7
|
| 5 |
+
antlr4-python3-runtime==4.9.3
|
| 6 |
+
webdataset==1.0.2
|
| 7 |
+
sentry-sdk==2.53.0
|
| 8 |
+
rdkit==2025.9.5
|
| 9 |
+
python-dotenv==1.2.1
|
| 10 |
+
proglog==0.1.12
|
| 11 |
+
omegaconf==2.3.0
|
| 12 |
+
narwhals==2.17.0
|
| 13 |
+
loguru==0.7.3
|
| 14 |
+
imageio-ffmpeg==0.6.0
|
| 15 |
+
plotly==6.5.2
|
| 16 |
+
moviepy==2.2.1
|
| 17 |
+
hydra-core==1.3.2
|
| 18 |
+
wandb==0.25.0
|
| 19 |
+
fastgen==0.1.0
|
| 20 |
+
packaging==25.0
|
| 21 |
+
setuptools==80.10.2
|
| 22 |
+
wheel==0.46.3
|
| 23 |
+
pip==26.0.1
|
| 24 |
+
webencodings==0.5.1
|
| 25 |
+
pure_eval==0.2.3
|
| 26 |
+
ptyprocess==0.7.0
|
| 27 |
+
nvidia-ml-py==13.590.48
|
| 28 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 29 |
+
mpmath==1.3.0
|
| 30 |
+
fastjsonschema==2.21.2
|
| 31 |
+
zipp==3.23.0
|
| 32 |
+
xyzservices==2025.11.0
|
| 33 |
+
widgetsnbextension==4.0.15
|
| 34 |
+
websocket-client==1.9.0
|
| 35 |
+
webcolors==25.10.0
|
| 36 |
+
wcwidth==0.6.0
|
| 37 |
+
urllib3==2.6.3
|
| 38 |
+
uri-template==1.3.0
|
| 39 |
+
tzdata==2025.3
|
| 40 |
+
typing_extensions==4.15.0
|
| 41 |
+
triton==3.6.0
|
| 42 |
+
traitlets==5.14.3
|
| 43 |
+
tqdm==4.67.3
|
| 44 |
+
tornado==6.5.5
|
| 45 |
+
tinycss2==1.4.0
|
| 46 |
+
sympy==1.14.0
|
| 47 |
+
soupsieve==2.8.3
|
| 48 |
+
smmap==5.0.3
|
| 49 |
+
six==1.16.0
|
| 50 |
+
sentencepiece==0.2.1
|
| 51 |
+
Send2Trash==2.1.0
|
| 52 |
+
safetensors==0.7.0
|
| 53 |
+
rpds-py==0.30.0
|
| 54 |
+
rfc3986-validator==0.1.1
|
| 55 |
+
regex==2026.2.28
|
| 56 |
+
pyzmq==27.1.0
|
| 57 |
+
PyYAML==6.0.3
|
| 58 |
+
python-json-logger==4.0.0
|
| 59 |
+
Pygments==2.19.2
|
| 60 |
+
pycparser==3.0
|
| 61 |
+
psutil==7.2.2
|
| 62 |
+
protobuf==4.24.4
|
| 63 |
+
prometheus_client==0.24.1
|
| 64 |
+
platformdirs==4.9.4
|
| 65 |
+
pillow==11.3.0
|
| 66 |
+
pexpect==4.9.0
|
| 67 |
+
parso==0.8.6
|
| 68 |
+
pandocfilters==1.5.1
|
| 69 |
+
nvidia-nvtx-cu12==12.8.90
|
| 70 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 71 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 72 |
+
nvidia-nccl-cu12==2.27.5
|
| 73 |
+
nvidia-curand-cu12==10.3.9.90
|
| 74 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 76 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 77 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 78 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 79 |
+
numpy==1.26.4
|
| 80 |
+
networkx==3.6.1
|
| 81 |
+
nest-asyncio==1.6.0
|
| 82 |
+
mistune==3.2.0
|
| 83 |
+
MarkupSafe==3.0.3
|
| 84 |
+
lark==1.3.1
|
| 85 |
+
jupyterlab_widgets==3.0.16
|
| 86 |
+
jupyterlab_pygments==0.3.0
|
| 87 |
+
jsonpointer==3.0.0
|
| 88 |
+
json5==0.13.0
|
| 89 |
+
jmespath==1.1.0
|
| 90 |
+
idna==3.11
|
| 91 |
+
hf-xet==1.4.2
|
| 92 |
+
h11==0.16.0
|
| 93 |
+
fsspec==2026.2.0
|
| 94 |
+
fqdn==1.5.1
|
| 95 |
+
filelock==3.25.2
|
| 96 |
+
executing==2.2.1
|
| 97 |
+
einops==0.8.2
|
| 98 |
+
defusedxml==0.7.1
|
| 99 |
+
decorator==5.2.1
|
| 100 |
+
debugpy==1.8.20
|
| 101 |
+
cuda-pathfinder==1.4.2
|
| 102 |
+
comm==0.2.3
|
| 103 |
+
click==8.3.1
|
| 104 |
+
charset-normalizer==3.4.5
|
| 105 |
+
certifi==2026.2.25
|
| 106 |
+
bleach==6.3.0
|
| 107 |
+
babel==2.18.0
|
| 108 |
+
av==17.0.0
|
| 109 |
+
attrs==25.4.0
|
| 110 |
+
async-lru==2.2.0
|
| 111 |
+
asttokens==3.0.1
|
| 112 |
+
annotated-types==0.7.0
|
| 113 |
+
typing-inspection==0.4.2
|
| 114 |
+
terminado==0.18.1
|
| 115 |
+
stack-data==0.6.3
|
| 116 |
+
scipy==1.17.1
|
| 117 |
+
rfc3987-syntax==1.1.0
|
| 118 |
+
rfc3339-validator==0.1.4
|
| 119 |
+
requests==2.32.5
|
| 120 |
+
referencing==0.37.0
|
| 121 |
+
python-dateutil==2.9.0.post0
|
| 122 |
+
pydantic_core==2.41.5
|
| 123 |
+
prompt_toolkit==3.0.52
|
| 124 |
+
opencv-python-headless==4.11.0.86
|
| 125 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 126 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 127 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 128 |
+
matplotlib-inline==0.2.1
|
| 129 |
+
jupyter_core==5.9.1
|
| 130 |
+
Jinja2==3.1.6
|
| 131 |
+
jedi==0.19.2
|
| 132 |
+
ipython_pygments_lexers==1.1.1
|
| 133 |
+
importlib_metadata==8.7.1
|
| 134 |
+
ImageIO==2.37.3
|
| 135 |
+
httpcore==1.0.9
|
| 136 |
+
gitdb==4.0.12
|
| 137 |
+
cuda-bindings==12.9.4
|
| 138 |
+
contourpy==1.3.3
|
| 139 |
+
cffi==2.0.0
|
| 140 |
+
beautifulsoup4==4.14.3
|
| 141 |
+
anyio==4.12.1
|
| 142 |
+
soundfile==0.13.1
|
| 143 |
+
pydantic==2.12.5
|
| 144 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 145 |
+
jupyter_server_terminals==0.5.4
|
| 146 |
+
jupyter_client==8.8.0
|
| 147 |
+
jsonschema-specifications==2025.9.1
|
| 148 |
+
ipython==9.11.0
|
| 149 |
+
httpx==0.28.1
|
| 150 |
+
GitPython==3.1.46
|
| 151 |
+
botocore==1.42.68
|
| 152 |
+
bokeh==3.9.0
|
| 153 |
+
arrow==1.4.0
|
| 154 |
+
argon2-cffi-bindings==25.1.0
|
| 155 |
+
torch==2.10.0
|
| 156 |
+
s3transfer==0.16.0
|
| 157 |
+
jsonschema==4.26.0
|
| 158 |
+
isoduration==20.11.0
|
| 159 |
+
ipywidgets==8.1.8
|
| 160 |
+
ipykernel==7.2.0
|
| 161 |
+
argon2-cffi==25.1.0
|
| 162 |
+
torchvision==0.25.0
|
| 163 |
+
nbformat==5.10.4
|
| 164 |
+
jupyter-console==6.6.3
|
| 165 |
+
boto3==1.42.68
|
| 166 |
+
accelerate==1.13.0
|
| 167 |
+
nbclient==0.10.4
|
| 168 |
+
jupyter-events==0.12.0
|
| 169 |
+
nbconvert==7.17.0
|
| 170 |
+
jupyter_server==2.17.0
|
| 171 |
+
notebook_shim==0.2.4
|
| 172 |
+
jupyterlab_server==2.28.0
|
| 173 |
+
jupyter-lsp==2.3.0
|
| 174 |
+
jupyterlab==4.5.6
|
| 175 |
+
notebook==7.5.5
|
| 176 |
+
jupyter==1.1.1
|
| 177 |
+
fastgen==0.1.0
|
| 178 |
+
pandas==3.0.1
|
| 179 |
+
shellingham==1.5.4
|
| 180 |
+
mdurl==0.1.2
|
| 181 |
+
annotated-doc==0.0.4
|
| 182 |
+
markdown-it-py==4.0.0
|
| 183 |
+
rich==14.3.3
|
| 184 |
+
typer==0.24.1
|
| 185 |
+
huggingface_hub==1.7.1
|
| 186 |
+
timm==1.0.25
|
| 187 |
+
tokenizers==0.22.2
|
| 188 |
+
diffusers==0.37.0
|
| 189 |
+
transformers==5.3.0
|
| 190 |
+
peft==0.18.1
|
| 191 |
+
easydict==1.13
|
| 192 |
+
lmdb==2.2.0
|
| 193 |
+
threadpoolctl==3.6.0
|
| 194 |
+
soxr==1.0.0
|
| 195 |
+
msgpack==1.1.2
|
| 196 |
+
llvmlite==0.47.0
|
| 197 |
+
lazy-loader==0.5
|
| 198 |
+
joblib==1.5.3
|
| 199 |
+
audioread==3.1.0
|
| 200 |
+
scikit-learn==1.8.0
|
| 201 |
+
pooch==1.9.0
|
| 202 |
+
numba==0.65.0
|
| 203 |
+
librosa==0.11.0
|
| 204 |
+
simsimd==6.5.16
|
| 205 |
+
flatbuffers==25.12.19
|
| 206 |
+
tifffile==2026.3.3
|
| 207 |
+
stringzilla==4.6.0
|
| 208 |
+
pyparsing==3.3.2
|
| 209 |
+
prettytable==3.17.0
|
| 210 |
+
onnx==1.17.0
|
| 211 |
+
kiwisolver==1.5.0
|
| 212 |
+
fonttools==4.62.1
|
| 213 |
+
Cython==3.2.4
|
| 214 |
+
cycler==0.12.1
|
| 215 |
+
scikit-image==0.26.0
|
| 216 |
+
onnxruntime==1.24.4
|
| 217 |
+
matplotlib==3.10.8
|
| 218 |
+
albucore==0.0.24
|
| 219 |
+
albumentations==2.0.8
|
| 220 |
+
insightface==0.7.3
|
| 221 |
+
kornia_rs==0.1.10
|
| 222 |
+
kornia==0.8.2
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-151-generic-x86_64-with-glibc2.39",
|
| 3 |
+
"python": "CPython 3.12.12",
|
| 4 |
+
"startedAt": "2026-04-02T15:01:32.168210Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config=fastgen/configs/experiments/OmniAvatar/config_sf.py",
|
| 7 |
+
"-",
|
| 8 |
+
"trainer.resume=False",
|
| 9 |
+
"log_config.name=sf_4gpu_bs8_lr2e6_5000iter_shift5_combined",
|
| 10 |
+
"log_config.project=OmniAvatar-FastGen"
|
| 11 |
+
],
|
| 12 |
+
"program": "/home/work/.local/hyunbin/FastGen/train.py",
|
| 13 |
+
"codePath": "train.py",
|
| 14 |
+
"codePathLocal": "train.py",
|
| 15 |
+
"git": {
|
| 16 |
+
"remote": "https://paulcho98:@github.com/paulcho98/FastGen.git",
|
| 17 |
+
"commit": "04de80beaf50f849c12a55a5d8358d94530b7bb5"
|
| 18 |
+
},
|
| 19 |
+
"email": "paul.hyunbin@gmail.com",
|
| 20 |
+
"root": "/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined",
|
| 21 |
+
"host": "main1",
|
| 22 |
+
"executable": "/home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12",
|
| 23 |
+
"cpu_count": 112,
|
| 24 |
+
"cpu_count_logical": 224,
|
| 25 |
+
"gpu": "NVIDIA H200",
|
| 26 |
+
"gpu_count": 4,
|
| 27 |
+
"disk": {
|
| 28 |
+
"/": {
|
| 29 |
+
"total": "1356758433792",
|
| 30 |
+
"used": "270456766464"
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"memory": {
|
| 34 |
+
"total": "2163961778176"
|
| 35 |
+
},
|
| 36 |
+
"gpu_nvidia": [
|
| 37 |
+
{
|
| 38 |
+
"name": "NVIDIA H200",
|
| 39 |
+
"memoryTotal": "150754820096",
|
| 40 |
+
"cudaCores": 16896,
|
| 41 |
+
"architecture": "Hopper",
|
| 42 |
+
"uuid": "GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c"
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "NVIDIA H200",
|
| 46 |
+
"memoryTotal": "150754820096",
|
| 47 |
+
"cudaCores": 16896,
|
| 48 |
+
"architecture": "Hopper",
|
| 49 |
+
"uuid": "GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262"
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"name": "NVIDIA H200",
|
| 53 |
+
"memoryTotal": "150754820096",
|
| 54 |
+
"cudaCores": 16896,
|
| 55 |
+
"architecture": "Hopper",
|
| 56 |
+
"uuid": "GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"name": "NVIDIA H200",
|
| 60 |
+
"memoryTotal": "150754820096",
|
| 61 |
+
"cudaCores": 16896,
|
| 62 |
+
"architecture": "Hopper",
|
| 63 |
+
"uuid": "GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0"
|
| 64 |
+
}
|
| 65 |
+
],
|
| 66 |
+
"cudaVersion": "12.9",
|
| 67 |
+
"writerId": "wykcz6se3w95mxueg1dbfpdz1rkcn7vb"
|
| 68 |
+
}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-core.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-03T00:01:32.229543031+09:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpprztt27o/port-1047116.txt","pid":1047116,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-04-03T00:01:32.230033289+09:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1047116}
|
| 3 |
+
{"time":"2026-04-03T00:01:32.230022361+09:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1047116-1057160-4156700530/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-04-03T00:01:32.416224439+09:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-04-03T00:01:32.430551403+09:00","level":"INFO","msg":"handleInformInit: received","streamId":"nkf4iovm","id":"1(@)"}
|
| 6 |
+
{"time":"2026-04-03T00:01:32.80024046+09:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"nkf4iovm","id":"1(@)"}
|
| 7 |
+
{"time":"2026-04-03T00:01:38.985900014+09:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"rtqkkeh0iczo"}
|
| 8 |
+
{"time":"2026-04-03T00:24:18.430668942+09:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-03T00:01:32.430639522+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
|
| 2 |
+
{"time":"2026-04-03T00:01:32.800089951+09:00","level":"INFO","msg":"stream: created new stream","id":"nkf4iovm"}
|
| 3 |
+
{"time":"2026-04-03T00:01:32.800139938+09:00","level":"INFO","msg":"handler: started","stream_id":"nkf4iovm"}
|
| 4 |
+
{"time":"2026-04-03T00:01:32.800233729+09:00","level":"INFO","msg":"stream: started","id":"nkf4iovm"}
|
| 5 |
+
{"time":"2026-04-03T00:01:32.80025365+09:00","level":"INFO","msg":"sender: started","stream_id":"nkf4iovm"}
|
| 6 |
+
{"time":"2026-04-03T00:01:32.800252986+09:00","level":"INFO","msg":"writer: started","stream_id":"nkf4iovm"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
|
| 2 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Configure stats pid to 1047116
|
| 3 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log
|
| 5 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log
|
| 6 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_lr2e6_5000iter_shift5_combined', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7f328019bce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
|
| 9 |
+
2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-04-03 00:01:32,416 INFO MainThread:1047116 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-04-03 00:01:32,428 INFO MainThread:1047116 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-04-03 00:01:32,432 INFO MainThread:1047116 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-04-03 00:01:32,448 INFO MainThread:1047116 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-03 00:01:33,668 INFO MainThread:1047116 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_console_start():2524] atexit reg
|
| 16 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2373] redirect: wrap_raw
|
| 17 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2442] Wrapping output streams.
|
| 18 |
+
2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2465] Redirects installed.
|
| 19 |
+
2026-04-03 00:01:33,986 INFO MainThread:1047116 [wandb_init.py:init():1082] run started, returning control to user process
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/run-nkf4iovm.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67b0ff79a5dafcc07f84b00216e9f32cb24f03c12ace3669ef628f85ac889c1e
|
| 3 |
+
size 360448
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb_id.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
nkf4iovm
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2/config.yaml
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataloader_train:
|
| 2 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 3 |
+
batch_size: '8'
|
| 4 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 5 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 6 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 7 |
+
num_workers: '2'
|
| 8 |
+
use_ref_sequence: 'True'
|
| 9 |
+
dataloader_val:
|
| 10 |
+
_target_: <function create_omniavatar_dataloader at 0x7fb9c8423a60>
|
| 11 |
+
batch_size: '1'
|
| 12 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 13 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 14 |
+
load_ode_path: 'False'
|
| 15 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 16 |
+
num_workers: '2'
|
| 17 |
+
use_ref_sequence: 'True'
|
| 18 |
+
eval:
|
| 19 |
+
max_ckpt: '100000000'
|
| 20 |
+
min_ckpt: '0'
|
| 21 |
+
num_samples: '50000'
|
| 22 |
+
samples_dir: samples
|
| 23 |
+
save_images: 'False'
|
| 24 |
+
log_config:
|
| 25 |
+
group: omniavatar_sf
|
| 26 |
+
name: sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2
|
| 27 |
+
project: OmniAvatar-FastGen
|
| 28 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 29 |
+
wandb_entity: paulhcho
|
| 30 |
+
wandb_mode: online
|
| 31 |
+
model:
|
| 32 |
+
add_teacher_to_fsdp_dict: 'True'
|
| 33 |
+
context_noise: '0.0'
|
| 34 |
+
ddp_find_unused_parameters: 'True'
|
| 35 |
+
device: cuda
|
| 36 |
+
discriminator:
|
| 37 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 38 |
+
all_res:
|
| 39 |
+
- '32'
|
| 40 |
+
- '16'
|
| 41 |
+
- '8'
|
| 42 |
+
feature_indices: '{0, 1, 2}'
|
| 43 |
+
in_channels: '256'
|
| 44 |
+
discriminator_optimizer:
|
| 45 |
+
_target_: <function get_optimizer at 0x7fb9c867e660>
|
| 46 |
+
betas:
|
| 47 |
+
- '0.9'
|
| 48 |
+
- '0.999'
|
| 49 |
+
eps: 1e-08
|
| 50 |
+
fused: 'False'
|
| 51 |
+
lr: '0.0001'
|
| 52 |
+
model: null
|
| 53 |
+
optim_type: adamw
|
| 54 |
+
weight_decay: '0.01'
|
| 55 |
+
discriminator_scheduler:
|
| 56 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 57 |
+
cycle_lengths:
|
| 58 |
+
- '10000000000'
|
| 59 |
+
f_max:
|
| 60 |
+
- '1.0'
|
| 61 |
+
f_min:
|
| 62 |
+
- '1.0'
|
| 63 |
+
f_start:
|
| 64 |
+
- 1e-06
|
| 65 |
+
warm_up_steps:
|
| 66 |
+
- '0'
|
| 67 |
+
enable_gradient_in_rollout: 'True'
|
| 68 |
+
enable_preprocessors: 'True'
|
| 69 |
+
fake_score: null
|
| 70 |
+
fake_score_net:
|
| 71 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 72 |
+
audio_hidden_size: '32'
|
| 73 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 74 |
+
in_dim: '65'
|
| 75 |
+
merge_lora: 'False'
|
| 76 |
+
mode: v2v
|
| 77 |
+
model_size: 1.3B
|
| 78 |
+
net_pred_type: flow
|
| 79 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 80 |
+
schedule_type: rf
|
| 81 |
+
use_audio: 'True'
|
| 82 |
+
fake_score_optimizer:
|
| 83 |
+
_target_: <function get_optimizer at 0x7fb9c867e660>
|
| 84 |
+
betas:
|
| 85 |
+
- '0.0'
|
| 86 |
+
- '0.999'
|
| 87 |
+
eps: 1e-08
|
| 88 |
+
fused: 'False'
|
| 89 |
+
lr: 2e-06
|
| 90 |
+
model: null
|
| 91 |
+
optim_type: adamw
|
| 92 |
+
weight_decay: '0.01'
|
| 93 |
+
fake_score_pred_type: x0
|
| 94 |
+
fake_score_scheduler:
|
| 95 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 96 |
+
cycle_lengths:
|
| 97 |
+
- '10000000000'
|
| 98 |
+
f_max:
|
| 99 |
+
- '1.0'
|
| 100 |
+
f_min:
|
| 101 |
+
- '1.0'
|
| 102 |
+
f_start:
|
| 103 |
+
- 1e-06
|
| 104 |
+
warm_up_steps:
|
| 105 |
+
- '0'
|
| 106 |
+
fsdp_meta_init: 'False'
|
| 107 |
+
gan_loss_weight_gen: '0'
|
| 108 |
+
gan_r1_reg_alpha: '0.1'
|
| 109 |
+
gan_r1_reg_weight: '0.0'
|
| 110 |
+
gan_use_same_t_noise: 'False'
|
| 111 |
+
grad_scaler_enabled: 'False'
|
| 112 |
+
grad_scaler_growth_interval: '2000'
|
| 113 |
+
grad_scaler_init_scale: '65536.0'
|
| 114 |
+
guidance_scale: '4.5'
|
| 115 |
+
input_shape:
|
| 116 |
+
- '16'
|
| 117 |
+
- '21'
|
| 118 |
+
- '64'
|
| 119 |
+
- '64'
|
| 120 |
+
last_step_only: 'False'
|
| 121 |
+
load_student_weights: 'False'
|
| 122 |
+
net:
|
| 123 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 124 |
+
audio_hidden_size: '32'
|
| 125 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 126 |
+
chunk_size: '3'
|
| 127 |
+
in_dim: '65'
|
| 128 |
+
mode: v2v
|
| 129 |
+
model_size: 1.3B
|
| 130 |
+
net_pred_type: flow
|
| 131 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 132 |
+
schedule_type: rf
|
| 133 |
+
total_num_frames: '21'
|
| 134 |
+
use_audio: 'True'
|
| 135 |
+
net_optimizer:
|
| 136 |
+
_target_: <function get_optimizer at 0x7fb9c867e660>
|
| 137 |
+
betas:
|
| 138 |
+
- '0.0'
|
| 139 |
+
- '0.999'
|
| 140 |
+
eps: 1e-08
|
| 141 |
+
fused: 'False'
|
| 142 |
+
lr: 2e-06
|
| 143 |
+
model: null
|
| 144 |
+
optim_type: adamw
|
| 145 |
+
weight_decay: '0.01'
|
| 146 |
+
net_scheduler:
|
| 147 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 148 |
+
cycle_lengths:
|
| 149 |
+
- '10000000000'
|
| 150 |
+
f_max:
|
| 151 |
+
- '1.0'
|
| 152 |
+
f_min:
|
| 153 |
+
- '1.0'
|
| 154 |
+
f_start:
|
| 155 |
+
- 1e-06
|
| 156 |
+
warm_up_steps:
|
| 157 |
+
- '0'
|
| 158 |
+
precision: bfloat16
|
| 159 |
+
precision_amp: null
|
| 160 |
+
precision_amp_enc: null
|
| 161 |
+
precision_amp_infer: null
|
| 162 |
+
precision_fsdp: bfloat16
|
| 163 |
+
pretrained_model_path: ''
|
| 164 |
+
pretrained_student_net_path: ''
|
| 165 |
+
same_step_across_blocks: 'True'
|
| 166 |
+
sample_t_cfg:
|
| 167 |
+
log_t_df: '0.01'
|
| 168 |
+
max_t: '0.999'
|
| 169 |
+
min_t: '0.001'
|
| 170 |
+
shift: '5.0'
|
| 171 |
+
t_list:
|
| 172 |
+
- '0.999'
|
| 173 |
+
- '0.937'
|
| 174 |
+
- '0.833'
|
| 175 |
+
- '0.624'
|
| 176 |
+
- '0.0'
|
| 177 |
+
time_dist_type: shifted
|
| 178 |
+
train_p_mean: '-1.1'
|
| 179 |
+
train_p_std: '2.0'
|
| 180 |
+
skip_layers: null
|
| 181 |
+
start_gradient_frame: '0'
|
| 182 |
+
student_sample_steps: '4'
|
| 183 |
+
student_sample_type: sde
|
| 184 |
+
student_update_freq: '5'
|
| 185 |
+
teacher:
|
| 186 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 187 |
+
audio_hidden_size: '32'
|
| 188 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 189 |
+
in_dim: '65'
|
| 190 |
+
merge_lora: 'True'
|
| 191 |
+
mode: v2v
|
| 192 |
+
model_size: 14B
|
| 193 |
+
net_pred_type: flow
|
| 194 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 195 |
+
schedule_type: rf
|
| 196 |
+
use_audio: 'True'
|
| 197 |
+
use_ema: 'False'
|
| 198 |
+
model_class:
|
| 199 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 200 |
+
config: null
|
| 201 |
+
trainer:
|
| 202 |
+
augment_pipe: null
|
| 203 |
+
batch_size_global: null
|
| 204 |
+
callbacks:
|
| 205 |
+
ema:
|
| 206 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 207 |
+
beta: '0.9999'
|
| 208 |
+
ema_halflife_kimg: '500'
|
| 209 |
+
ema_rampup_ratio: '0.05'
|
| 210 |
+
gamma: '16.97'
|
| 211 |
+
start_iter: '0'
|
| 212 |
+
type: constant
|
| 213 |
+
gpu_stats:
|
| 214 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 215 |
+
every_n: '100'
|
| 216 |
+
grad_clip:
|
| 217 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 218 |
+
grad_norm: '10.0'
|
| 219 |
+
model_key: net
|
| 220 |
+
param_count:
|
| 221 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 222 |
+
train_profiler:
|
| 223 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 224 |
+
every_n: '100'
|
| 225 |
+
wandb:
|
| 226 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 227 |
+
fps: '25'
|
| 228 |
+
sample_logging_iter: '100'
|
| 229 |
+
checkpointer:
|
| 230 |
+
pretrained_ckpt_key_map:
|
| 231 |
+
net: net
|
| 232 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 233 |
+
s3_container: s3://checkpoints/fastgen
|
| 234 |
+
s3_credential: ./credentials/s3.json
|
| 235 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2/checkpoints
|
| 236 |
+
use_s3: 'False'
|
| 237 |
+
cudnn:
|
| 238 |
+
benchmark: 'True'
|
| 239 |
+
deterministic: 'False'
|
| 240 |
+
ddp: 'False'
|
| 241 |
+
fsdp: 'True'
|
| 242 |
+
fsdp_cpu_offload: 'False'
|
| 243 |
+
fsdp_min_num_params: '10000000'
|
| 244 |
+
fsdp_sharding_group_size: null
|
| 245 |
+
global_vars: null
|
| 246 |
+
global_vars_val:
|
| 247 |
+
- null
|
| 248 |
+
grad_accum_rounds: '2'
|
| 249 |
+
logging_iter: '1'
|
| 250 |
+
max_iter: '5000'
|
| 251 |
+
offload_module_in_decoding: 'False'
|
| 252 |
+
resume: 'False'
|
| 253 |
+
save_ckpt_iter: '100'
|
| 254 |
+
seed: '0'
|
| 255 |
+
skip_initial_validation: 'True'
|
| 256 |
+
tf32_enabled: 'True'
|
| 257 |
+
val_seed: null
|
| 258 |
+
validation_iter: '100'
|
| 259 |
+
visualize_teacher: 'False'
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/config.yaml
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataloader_train:
|
| 2 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 3 |
+
batch_size: '8'
|
| 4 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 5 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 6 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 7 |
+
num_workers: '2'
|
| 8 |
+
use_ref_sequence: 'True'
|
| 9 |
+
dataloader_val:
|
| 10 |
+
_target_: <function create_omniavatar_dataloader at 0x7f7c52b8fce0>
|
| 11 |
+
batch_size: '1'
|
| 12 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 13 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 14 |
+
load_ode_path: 'False'
|
| 15 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 16 |
+
num_workers: '2'
|
| 17 |
+
use_ref_sequence: 'True'
|
| 18 |
+
eval:
|
| 19 |
+
max_ckpt: '100000000'
|
| 20 |
+
min_ckpt: '0'
|
| 21 |
+
num_samples: '50000'
|
| 22 |
+
samples_dir: samples
|
| 23 |
+
save_images: 'False'
|
| 24 |
+
log_config:
|
| 25 |
+
group: omniavatar_sf
|
| 26 |
+
name: sf_combined_debug
|
| 27 |
+
project: OmniAvatar-FastGen
|
| 28 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 29 |
+
wandb_entity: paulhcho
|
| 30 |
+
wandb_mode: disabled
|
| 31 |
+
model:
|
| 32 |
+
add_teacher_to_fsdp_dict: 'True'
|
| 33 |
+
context_noise: '0.0'
|
| 34 |
+
ddp_find_unused_parameters: 'True'
|
| 35 |
+
device: cuda
|
| 36 |
+
discriminator:
|
| 37 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 38 |
+
all_res:
|
| 39 |
+
- '32'
|
| 40 |
+
- '16'
|
| 41 |
+
- '8'
|
| 42 |
+
feature_indices: '{0, 1, 2}'
|
| 43 |
+
in_channels: '256'
|
| 44 |
+
discriminator_optimizer:
|
| 45 |
+
_target_: <function get_optimizer at 0x7f7c52bf2660>
|
| 46 |
+
betas:
|
| 47 |
+
- '0.9'
|
| 48 |
+
- '0.999'
|
| 49 |
+
eps: 1e-08
|
| 50 |
+
fused: 'False'
|
| 51 |
+
lr: '0.0001'
|
| 52 |
+
model: null
|
| 53 |
+
optim_type: adamw
|
| 54 |
+
weight_decay: '0.01'
|
| 55 |
+
discriminator_scheduler:
|
| 56 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 57 |
+
cycle_lengths:
|
| 58 |
+
- '10000000000'
|
| 59 |
+
f_max:
|
| 60 |
+
- '1.0'
|
| 61 |
+
f_min:
|
| 62 |
+
- '1.0'
|
| 63 |
+
f_start:
|
| 64 |
+
- 1e-06
|
| 65 |
+
warm_up_steps:
|
| 66 |
+
- '0'
|
| 67 |
+
enable_gradient_in_rollout: 'True'
|
| 68 |
+
enable_preprocessors: 'True'
|
| 69 |
+
fake_score: null
|
| 70 |
+
fake_score_net:
|
| 71 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 72 |
+
audio_hidden_size: '32'
|
| 73 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 74 |
+
in_dim: '65'
|
| 75 |
+
merge_lora: 'False'
|
| 76 |
+
mode: v2v
|
| 77 |
+
model_size: 1.3B
|
| 78 |
+
net_pred_type: flow
|
| 79 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 80 |
+
schedule_type: rf
|
| 81 |
+
use_audio: 'True'
|
| 82 |
+
fake_score_optimizer:
|
| 83 |
+
_target_: <function get_optimizer at 0x7f7c52bf2660>
|
| 84 |
+
betas:
|
| 85 |
+
- '0.0'
|
| 86 |
+
- '0.999'
|
| 87 |
+
eps: 1e-08
|
| 88 |
+
fused: 'False'
|
| 89 |
+
lr: 2e-06
|
| 90 |
+
model: null
|
| 91 |
+
optim_type: adamw
|
| 92 |
+
weight_decay: '0.01'
|
| 93 |
+
fake_score_pred_type: x0
|
| 94 |
+
fake_score_scheduler:
|
| 95 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 96 |
+
cycle_lengths:
|
| 97 |
+
- '10000000000'
|
| 98 |
+
f_max:
|
| 99 |
+
- '1.0'
|
| 100 |
+
f_min:
|
| 101 |
+
- '1.0'
|
| 102 |
+
f_start:
|
| 103 |
+
- 1e-06
|
| 104 |
+
warm_up_steps:
|
| 105 |
+
- '0'
|
| 106 |
+
fsdp_meta_init: 'False'
|
| 107 |
+
gan_loss_weight_gen: '0'
|
| 108 |
+
gan_r1_reg_alpha: '0.1'
|
| 109 |
+
gan_r1_reg_weight: '0.0'
|
| 110 |
+
gan_use_same_t_noise: 'False'
|
| 111 |
+
grad_scaler_enabled: 'False'
|
| 112 |
+
grad_scaler_growth_interval: '2000'
|
| 113 |
+
grad_scaler_init_scale: '65536.0'
|
| 114 |
+
guidance_scale: '4.5'
|
| 115 |
+
input_shape:
|
| 116 |
+
- '16'
|
| 117 |
+
- '21'
|
| 118 |
+
- '64'
|
| 119 |
+
- '64'
|
| 120 |
+
last_step_only: 'False'
|
| 121 |
+
load_student_weights: 'False'
|
| 122 |
+
net:
|
| 123 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 124 |
+
audio_hidden_size: '32'
|
| 125 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 126 |
+
chunk_size: '3'
|
| 127 |
+
in_dim: '65'
|
| 128 |
+
mode: v2v
|
| 129 |
+
model_size: 1.3B
|
| 130 |
+
net_pred_type: flow
|
| 131 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 132 |
+
schedule_type: rf
|
| 133 |
+
total_num_frames: '21'
|
| 134 |
+
use_audio: 'True'
|
| 135 |
+
net_optimizer:
|
| 136 |
+
_target_: <function get_optimizer at 0x7f7c52bf2660>
|
| 137 |
+
betas:
|
| 138 |
+
- '0.0'
|
| 139 |
+
- '0.999'
|
| 140 |
+
eps: 1e-08
|
| 141 |
+
fused: 'False'
|
| 142 |
+
lr: 2e-06
|
| 143 |
+
model: null
|
| 144 |
+
optim_type: adamw
|
| 145 |
+
weight_decay: '0.01'
|
| 146 |
+
net_scheduler:
|
| 147 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 148 |
+
cycle_lengths:
|
| 149 |
+
- '10000000000'
|
| 150 |
+
f_max:
|
| 151 |
+
- '1.0'
|
| 152 |
+
f_min:
|
| 153 |
+
- '1.0'
|
| 154 |
+
f_start:
|
| 155 |
+
- 1e-06
|
| 156 |
+
warm_up_steps:
|
| 157 |
+
- '0'
|
| 158 |
+
precision: bfloat16
|
| 159 |
+
precision_amp: null
|
| 160 |
+
precision_amp_enc: null
|
| 161 |
+
precision_amp_infer: null
|
| 162 |
+
precision_fsdp: bfloat16
|
| 163 |
+
pretrained_model_path: ''
|
| 164 |
+
pretrained_student_net_path: ''
|
| 165 |
+
same_step_across_blocks: 'True'
|
| 166 |
+
sample_t_cfg:
|
| 167 |
+
log_t_df: '0.01'
|
| 168 |
+
max_t: '0.999'
|
| 169 |
+
min_t: '0.001'
|
| 170 |
+
shift: '5.0'
|
| 171 |
+
t_list:
|
| 172 |
+
- '0.999'
|
| 173 |
+
- '0.937'
|
| 174 |
+
- '0.833'
|
| 175 |
+
- '0.624'
|
| 176 |
+
- '0.0'
|
| 177 |
+
time_dist_type: shifted
|
| 178 |
+
train_p_mean: '-1.1'
|
| 179 |
+
train_p_std: '2.0'
|
| 180 |
+
skip_layers: null
|
| 181 |
+
start_gradient_frame: '0'
|
| 182 |
+
student_sample_steps: '4'
|
| 183 |
+
student_sample_type: sde
|
| 184 |
+
student_update_freq: '5'
|
| 185 |
+
teacher:
|
| 186 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 187 |
+
audio_hidden_size: '32'
|
| 188 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 189 |
+
in_dim: '65'
|
| 190 |
+
merge_lora: 'True'
|
| 191 |
+
mode: v2v
|
| 192 |
+
model_size: 14B
|
| 193 |
+
net_pred_type: flow
|
| 194 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 195 |
+
schedule_type: rf
|
| 196 |
+
use_audio: 'True'
|
| 197 |
+
use_ema: 'False'
|
| 198 |
+
model_class:
|
| 199 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 200 |
+
config: null
|
| 201 |
+
trainer:
|
| 202 |
+
augment_pipe: null
|
| 203 |
+
batch_size_global: null
|
| 204 |
+
callbacks:
|
| 205 |
+
ema:
|
| 206 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 207 |
+
beta: '0.9999'
|
| 208 |
+
ema_halflife_kimg: '500'
|
| 209 |
+
ema_rampup_ratio: '0.05'
|
| 210 |
+
gamma: '16.97'
|
| 211 |
+
start_iter: '0'
|
| 212 |
+
type: constant
|
| 213 |
+
gpu_stats:
|
| 214 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 215 |
+
every_n: '100'
|
| 216 |
+
grad_clip:
|
| 217 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 218 |
+
grad_norm: '10.0'
|
| 219 |
+
model_key: net
|
| 220 |
+
param_count:
|
| 221 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 222 |
+
train_profiler:
|
| 223 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 224 |
+
every_n: '100'
|
| 225 |
+
wandb:
|
| 226 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 227 |
+
fps: '25'
|
| 228 |
+
sample_logging_iter: '100'
|
| 229 |
+
checkpointer:
|
| 230 |
+
pretrained_ckpt_key_map:
|
| 231 |
+
net: net
|
| 232 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 233 |
+
s3_container: s3://checkpoints/fastgen
|
| 234 |
+
s3_credential: ./credentials/s3.json
|
| 235 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/checkpoints
|
| 236 |
+
use_s3: 'False'
|
| 237 |
+
cudnn:
|
| 238 |
+
benchmark: 'True'
|
| 239 |
+
deterministic: 'False'
|
| 240 |
+
ddp: 'False'
|
| 241 |
+
fsdp: 'True'
|
| 242 |
+
fsdp_cpu_offload: 'False'
|
| 243 |
+
fsdp_min_num_params: '10000000'
|
| 244 |
+
fsdp_sharding_group_size: null
|
| 245 |
+
global_vars: null
|
| 246 |
+
global_vars_val:
|
| 247 |
+
- null
|
| 248 |
+
grad_accum_rounds: '2'
|
| 249 |
+
logging_iter: '1'
|
| 250 |
+
max_iter: '10'
|
| 251 |
+
offload_module_in_decoding: 'False'
|
| 252 |
+
resume: 'False'
|
| 253 |
+
save_ckpt_iter: '100'
|
| 254 |
+
seed: '0'
|
| 255 |
+
skip_initial_validation: 'True'
|
| 256 |
+
tf32_enabled: 'True'
|
| 257 |
+
val_seed: null
|
| 258 |
+
validation_iter: '100'
|
| 259 |
+
visualize_teacher: 'False'
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/wandb_id.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
k4ws77lt
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/config.yaml
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataloader_train:
|
| 2 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 3 |
+
batch_size: '8'
|
| 4 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 5 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 6 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 7 |
+
num_workers: '2'
|
| 8 |
+
use_ref_sequence: 'True'
|
| 9 |
+
dataloader_val:
|
| 10 |
+
_target_: <function create_omniavatar_dataloader at 0x7f99973a7ce0>
|
| 11 |
+
batch_size: '1'
|
| 12 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 13 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 14 |
+
load_ode_path: 'False'
|
| 15 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 16 |
+
num_workers: '2'
|
| 17 |
+
use_ref_sequence: 'True'
|
| 18 |
+
eval:
|
| 19 |
+
max_ckpt: '100000000'
|
| 20 |
+
min_ckpt: '0'
|
| 21 |
+
num_samples: '50000'
|
| 22 |
+
samples_dir: samples
|
| 23 |
+
save_images: 'False'
|
| 24 |
+
log_config:
|
| 25 |
+
group: omniavatar_sf
|
| 26 |
+
name: sf_combined_step_test
|
| 27 |
+
project: OmniAvatar-FastGen
|
| 28 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 29 |
+
wandb_entity: paulhcho
|
| 30 |
+
wandb_mode: online
|
| 31 |
+
model:
|
| 32 |
+
add_teacher_to_fsdp_dict: 'True'
|
| 33 |
+
context_noise: '0.0'
|
| 34 |
+
ddp_find_unused_parameters: 'True'
|
| 35 |
+
device: cuda
|
| 36 |
+
discriminator:
|
| 37 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 38 |
+
all_res:
|
| 39 |
+
- '32'
|
| 40 |
+
- '16'
|
| 41 |
+
- '8'
|
| 42 |
+
feature_indices: '{0, 1, 2}'
|
| 43 |
+
in_channels: '256'
|
| 44 |
+
discriminator_optimizer:
|
| 45 |
+
_target_: <function get_optimizer at 0x7f99975fe660>
|
| 46 |
+
betas:
|
| 47 |
+
- '0.9'
|
| 48 |
+
- '0.999'
|
| 49 |
+
eps: 1e-08
|
| 50 |
+
fused: 'False'
|
| 51 |
+
lr: '0.0001'
|
| 52 |
+
model: null
|
| 53 |
+
optim_type: adamw
|
| 54 |
+
weight_decay: '0.01'
|
| 55 |
+
discriminator_scheduler:
|
| 56 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 57 |
+
cycle_lengths:
|
| 58 |
+
- '10000000000'
|
| 59 |
+
f_max:
|
| 60 |
+
- '1.0'
|
| 61 |
+
f_min:
|
| 62 |
+
- '1.0'
|
| 63 |
+
f_start:
|
| 64 |
+
- 1e-06
|
| 65 |
+
warm_up_steps:
|
| 66 |
+
- '0'
|
| 67 |
+
enable_gradient_in_rollout: 'True'
|
| 68 |
+
enable_preprocessors: 'True'
|
| 69 |
+
fake_score: null
|
| 70 |
+
fake_score_net:
|
| 71 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 72 |
+
audio_hidden_size: '32'
|
| 73 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 74 |
+
in_dim: '65'
|
| 75 |
+
merge_lora: 'False'
|
| 76 |
+
mode: v2v
|
| 77 |
+
model_size: 1.3B
|
| 78 |
+
net_pred_type: flow
|
| 79 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 80 |
+
schedule_type: rf
|
| 81 |
+
use_audio: 'True'
|
| 82 |
+
fake_score_optimizer:
|
| 83 |
+
_target_: <function get_optimizer at 0x7f99975fe660>
|
| 84 |
+
betas:
|
| 85 |
+
- '0.0'
|
| 86 |
+
- '0.999'
|
| 87 |
+
eps: 1e-08
|
| 88 |
+
fused: 'False'
|
| 89 |
+
lr: 2e-06
|
| 90 |
+
model: null
|
| 91 |
+
optim_type: adamw
|
| 92 |
+
weight_decay: '0.01'
|
| 93 |
+
fake_score_pred_type: x0
|
| 94 |
+
fake_score_scheduler:
|
| 95 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 96 |
+
cycle_lengths:
|
| 97 |
+
- '10000000000'
|
| 98 |
+
f_max:
|
| 99 |
+
- '1.0'
|
| 100 |
+
f_min:
|
| 101 |
+
- '1.0'
|
| 102 |
+
f_start:
|
| 103 |
+
- 1e-06
|
| 104 |
+
warm_up_steps:
|
| 105 |
+
- '0'
|
| 106 |
+
fsdp_meta_init: 'False'
|
| 107 |
+
gan_loss_weight_gen: '0'
|
| 108 |
+
gan_r1_reg_alpha: '0.1'
|
| 109 |
+
gan_r1_reg_weight: '0.0'
|
| 110 |
+
gan_use_same_t_noise: 'False'
|
| 111 |
+
grad_scaler_enabled: 'False'
|
| 112 |
+
grad_scaler_growth_interval: '2000'
|
| 113 |
+
grad_scaler_init_scale: '65536.0'
|
| 114 |
+
guidance_scale: '4.5'
|
| 115 |
+
input_shape:
|
| 116 |
+
- '16'
|
| 117 |
+
- '21'
|
| 118 |
+
- '64'
|
| 119 |
+
- '64'
|
| 120 |
+
last_step_only: 'False'
|
| 121 |
+
load_student_weights: 'False'
|
| 122 |
+
net:
|
| 123 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 124 |
+
audio_hidden_size: '32'
|
| 125 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 126 |
+
chunk_size: '3'
|
| 127 |
+
in_dim: '65'
|
| 128 |
+
mode: v2v
|
| 129 |
+
model_size: 1.3B
|
| 130 |
+
net_pred_type: flow
|
| 131 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 132 |
+
schedule_type: rf
|
| 133 |
+
total_num_frames: '21'
|
| 134 |
+
use_audio: 'True'
|
| 135 |
+
net_optimizer:
|
| 136 |
+
_target_: <function get_optimizer at 0x7f99975fe660>
|
| 137 |
+
betas:
|
| 138 |
+
- '0.0'
|
| 139 |
+
- '0.999'
|
| 140 |
+
eps: 1e-08
|
| 141 |
+
fused: 'False'
|
| 142 |
+
lr: 2e-06
|
| 143 |
+
model: null
|
| 144 |
+
optim_type: adamw
|
| 145 |
+
weight_decay: '0.01'
|
| 146 |
+
net_scheduler:
|
| 147 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 148 |
+
cycle_lengths:
|
| 149 |
+
- '10000000000'
|
| 150 |
+
f_max:
|
| 151 |
+
- '1.0'
|
| 152 |
+
f_min:
|
| 153 |
+
- '1.0'
|
| 154 |
+
f_start:
|
| 155 |
+
- 1e-06
|
| 156 |
+
warm_up_steps:
|
| 157 |
+
- '0'
|
| 158 |
+
precision: bfloat16
|
| 159 |
+
precision_amp: null
|
| 160 |
+
precision_amp_enc: null
|
| 161 |
+
precision_amp_infer: null
|
| 162 |
+
precision_fsdp: bfloat16
|
| 163 |
+
pretrained_model_path: ''
|
| 164 |
+
pretrained_student_net_path: ''
|
| 165 |
+
same_step_across_blocks: 'True'
|
| 166 |
+
sample_t_cfg:
|
| 167 |
+
log_t_df: '0.01'
|
| 168 |
+
max_t: '0.999'
|
| 169 |
+
min_t: '0.001'
|
| 170 |
+
shift: '5.0'
|
| 171 |
+
t_list:
|
| 172 |
+
- '0.999'
|
| 173 |
+
- '0.937'
|
| 174 |
+
- '0.833'
|
| 175 |
+
- '0.624'
|
| 176 |
+
- '0.0'
|
| 177 |
+
time_dist_type: shifted
|
| 178 |
+
train_p_mean: '-1.1'
|
| 179 |
+
train_p_std: '2.0'
|
| 180 |
+
skip_layers: null
|
| 181 |
+
start_gradient_frame: '0'
|
| 182 |
+
student_sample_steps: '4'
|
| 183 |
+
student_sample_type: sde
|
| 184 |
+
student_update_freq: '5'
|
| 185 |
+
teacher:
|
| 186 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 187 |
+
audio_hidden_size: '32'
|
| 188 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 189 |
+
in_dim: '65'
|
| 190 |
+
merge_lora: 'True'
|
| 191 |
+
mode: v2v
|
| 192 |
+
model_size: 14B
|
| 193 |
+
net_pred_type: flow
|
| 194 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 195 |
+
schedule_type: rf
|
| 196 |
+
use_audio: 'True'
|
| 197 |
+
use_ema: 'False'
|
| 198 |
+
model_class:
|
| 199 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 200 |
+
config: null
|
| 201 |
+
trainer:
|
| 202 |
+
augment_pipe: null
|
| 203 |
+
batch_size_global: null
|
| 204 |
+
callbacks:
|
| 205 |
+
ema:
|
| 206 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 207 |
+
beta: '0.9999'
|
| 208 |
+
ema_halflife_kimg: '500'
|
| 209 |
+
ema_rampup_ratio: '0.05'
|
| 210 |
+
gamma: '16.97'
|
| 211 |
+
start_iter: '0'
|
| 212 |
+
type: constant
|
| 213 |
+
gpu_stats:
|
| 214 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 215 |
+
every_n: '100'
|
| 216 |
+
grad_clip:
|
| 217 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 218 |
+
grad_norm: '10.0'
|
| 219 |
+
model_key: net
|
| 220 |
+
param_count:
|
| 221 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 222 |
+
train_profiler:
|
| 223 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 224 |
+
every_n: '100'
|
| 225 |
+
wandb:
|
| 226 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 227 |
+
fps: '25'
|
| 228 |
+
sample_logging_iter: '100'
|
| 229 |
+
checkpointer:
|
| 230 |
+
pretrained_ckpt_key_map:
|
| 231 |
+
net: net
|
| 232 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 233 |
+
s3_container: s3://checkpoints/fastgen
|
| 234 |
+
s3_credential: ./credentials/s3.json
|
| 235 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints
|
| 236 |
+
use_s3: 'False'
|
| 237 |
+
cudnn:
|
| 238 |
+
benchmark: 'True'
|
| 239 |
+
deterministic: 'False'
|
| 240 |
+
ddp: 'False'
|
| 241 |
+
fsdp: 'True'
|
| 242 |
+
fsdp_cpu_offload: 'False'
|
| 243 |
+
fsdp_min_num_params: '10000000'
|
| 244 |
+
fsdp_sharding_group_size: null
|
| 245 |
+
global_vars: null
|
| 246 |
+
global_vars_val:
|
| 247 |
+
- null
|
| 248 |
+
grad_accum_rounds: '2'
|
| 249 |
+
logging_iter: '1'
|
| 250 |
+
max_iter: '20'
|
| 251 |
+
offload_module_in_decoding: 'False'
|
| 252 |
+
resume: 'False'
|
| 253 |
+
save_ckpt_iter: '100'
|
| 254 |
+
seed: '0'
|
| 255 |
+
skip_initial_validation: 'True'
|
| 256 |
+
tf32_enabled: 'True'
|
| 257 |
+
val_seed: null
|
| 258 |
+
validation_iter: '100'
|
| 259 |
+
visualize_teacher: 'False'
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-02T22:45:31.396833745+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
|
| 2 |
+
{"time":"2026-04-02T22:45:31.746696076+09:00","level":"INFO","msg":"stream: created new stream","id":"spcd04xe"}
|
| 3 |
+
{"time":"2026-04-02T22:45:31.746744779+09:00","level":"INFO","msg":"handler: started","stream_id":"spcd04xe"}
|
| 4 |
+
{"time":"2026-04-02T22:45:31.746822827+09:00","level":"INFO","msg":"stream: started","id":"spcd04xe"}
|
| 5 |
+
{"time":"2026-04-02T22:45:31.746841154+09:00","level":"INFO","msg":"sender: started","stream_id":"spcd04xe"}
|
| 6 |
+
{"time":"2026-04-02T22:45:31.74684523+09:00","level":"INFO","msg":"writer: started","stream_id":"spcd04xe"}
|
| 7 |
+
{"time":"2026-04-02T22:52:57.91103952+09:00","level":"INFO","msg":"stream: closing","id":"spcd04xe"}
|
| 8 |
+
{"time":"2026-04-02T22:52:58.486184439+09:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-04-02T22:52:58.869727008+09:00","level":"INFO","msg":"handler: closed","stream_id":"spcd04xe"}
|
| 10 |
+
{"time":"2026-04-02T22:52:58.869816672+09:00","level":"INFO","msg":"sender: closed","stream_id":"spcd04xe"}
|
| 11 |
+
{"time":"2026-04-02T22:52:58.869827326+09:00","level":"INFO","msg":"stream: closed","id":"spcd04xe"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
|
| 2 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Configure stats pid to 792541
|
| 3 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log
|
| 5 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log
|
| 6 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_combined_step_test', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '20', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7ff2f56a3ce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
|
| 9 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-04-02 22:45:31,381 INFO MainThread:792541 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-04-02 22:45:31,394 INFO MainThread:792541 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-04-02 22:45:31,398 INFO MainThread:792541 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-04-02 22:45:31,413 INFO MainThread:792541 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-02 22:45:32,719 INFO MainThread:792541 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_console_start():2524] atexit reg
|
| 16 |
+
2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_redirect():2373] redirect: wrap_raw
|
| 17 |
+
2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2442] Wrapping output streams.
|
| 18 |
+
2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2465] Redirects installed.
|
| 19 |
+
2026-04-02 22:45:32,998 INFO MainThread:792541 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [service_client.py:_forward_responses():134] Reached EOF.
|
| 21 |
+
2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/config.yaml
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.25.0
|
| 4 |
+
e:
|
| 5 |
+
mf7qlulwtbceq6fkw52thj4sgf25e3dz:
|
| 6 |
+
args:
|
| 7 |
+
- --config=fastgen/configs/experiments/OmniAvatar/config_sf.py
|
| 8 |
+
- '-'
|
| 9 |
+
- trainer.resume=False
|
| 10 |
+
- log_config.name=sf_combined_step_test
|
| 11 |
+
- log_config.project=OmniAvatar-FastGen
|
| 12 |
+
- trainer.max_iter=20
|
| 13 |
+
codePath: train.py
|
| 14 |
+
codePathLocal: train.py
|
| 15 |
+
cpu_count: 112
|
| 16 |
+
cpu_count_logical: 224
|
| 17 |
+
cudaVersion: "12.9"
|
| 18 |
+
disk:
|
| 19 |
+
/:
|
| 20 |
+
total: "1356758433792"
|
| 21 |
+
used: "257963536384"
|
| 22 |
+
email: paul.hyunbin@gmail.com
|
| 23 |
+
executable: /home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12
|
| 24 |
+
git:
|
| 25 |
+
commit: 04de80beaf50f849c12a55a5d8358d94530b7bb5
|
| 26 |
+
remote: https://paulcho98:@github.com/paulcho98/FastGen.git
|
| 27 |
+
gpu: NVIDIA H200
|
| 28 |
+
gpu_count: 4
|
| 29 |
+
gpu_nvidia:
|
| 30 |
+
- architecture: Hopper
|
| 31 |
+
cudaCores: 16896
|
| 32 |
+
memoryTotal: "150754820096"
|
| 33 |
+
name: NVIDIA H200
|
| 34 |
+
uuid: GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c
|
| 35 |
+
- architecture: Hopper
|
| 36 |
+
cudaCores: 16896
|
| 37 |
+
memoryTotal: "150754820096"
|
| 38 |
+
name: NVIDIA H200
|
| 39 |
+
uuid: GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262
|
| 40 |
+
- architecture: Hopper
|
| 41 |
+
cudaCores: 16896
|
| 42 |
+
memoryTotal: "150754820096"
|
| 43 |
+
name: NVIDIA H200
|
| 44 |
+
uuid: GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790
|
| 45 |
+
- architecture: Hopper
|
| 46 |
+
cudaCores: 16896
|
| 47 |
+
memoryTotal: "150754820096"
|
| 48 |
+
name: NVIDIA H200
|
| 49 |
+
uuid: GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0
|
| 50 |
+
host: main1
|
| 51 |
+
memory:
|
| 52 |
+
total: "2163961778176"
|
| 53 |
+
os: Linux-5.15.0-151-generic-x86_64-with-glibc2.39
|
| 54 |
+
program: /home/work/.local/hyunbin/FastGen/train.py
|
| 55 |
+
python: CPython 3.12.12
|
| 56 |
+
root: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test
|
| 57 |
+
startedAt: "2026-04-02T13:45:31.145420Z"
|
| 58 |
+
writerId: mf7qlulwtbceq6fkw52thj4sgf25e3dz
|
| 59 |
+
m: []
|
| 60 |
+
python_version: 3.12.12
|
| 61 |
+
t:
|
| 62 |
+
"1":
|
| 63 |
+
- 1
|
| 64 |
+
- 5
|
| 65 |
+
- 11
|
| 66 |
+
- 41
|
| 67 |
+
- 49
|
| 68 |
+
- 50
|
| 69 |
+
- 53
|
| 70 |
+
- 63
|
| 71 |
+
- 71
|
| 72 |
+
- 83
|
| 73 |
+
- 98
|
| 74 |
+
"2":
|
| 75 |
+
- 1
|
| 76 |
+
- 5
|
| 77 |
+
- 11
|
| 78 |
+
- 41
|
| 79 |
+
- 49
|
| 80 |
+
- 50
|
| 81 |
+
- 53
|
| 82 |
+
- 63
|
| 83 |
+
- 71
|
| 84 |
+
- 83
|
| 85 |
+
- 98
|
| 86 |
+
"3":
|
| 87 |
+
- 13
|
| 88 |
+
- 14
|
| 89 |
+
- 16
|
| 90 |
+
- 61
|
| 91 |
+
- 62
|
| 92 |
+
"4": 3.12.12
|
| 93 |
+
"5": 0.25.0
|
| 94 |
+
"6": 5.3.0
|
| 95 |
+
"12": 0.25.0
|
| 96 |
+
"13": linux-x86_64
|
| 97 |
+
dataloader_train:
|
| 98 |
+
value:
|
| 99 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 100 |
+
batch_size: "8"
|
| 101 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 102 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 103 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 104 |
+
num_workers: "2"
|
| 105 |
+
use_ref_sequence: "True"
|
| 106 |
+
dataloader_val:
|
| 107 |
+
value:
|
| 108 |
+
_target_: <function create_omniavatar_dataloader at 0x7ff2f56a3ce0>
|
| 109 |
+
batch_size: "1"
|
| 110 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 111 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 112 |
+
load_ode_path: "False"
|
| 113 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 114 |
+
num_workers: "2"
|
| 115 |
+
use_ref_sequence: "True"
|
| 116 |
+
eval:
|
| 117 |
+
value:
|
| 118 |
+
max_ckpt: "100000000"
|
| 119 |
+
min_ckpt: "0"
|
| 120 |
+
num_samples: "50000"
|
| 121 |
+
samples_dir: samples
|
| 122 |
+
save_images: "False"
|
| 123 |
+
log_config:
|
| 124 |
+
value:
|
| 125 |
+
group: omniavatar_sf
|
| 126 |
+
name: sf_combined_step_test
|
| 127 |
+
project: OmniAvatar-FastGen
|
| 128 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 129 |
+
wandb_entity: paulhcho
|
| 130 |
+
wandb_mode: online
|
| 131 |
+
model:
|
| 132 |
+
value:
|
| 133 |
+
add_teacher_to_fsdp_dict: "True"
|
| 134 |
+
context_noise: "0.0"
|
| 135 |
+
ddp_find_unused_parameters: "True"
|
| 136 |
+
device: cuda
|
| 137 |
+
discriminator:
|
| 138 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 139 |
+
all_res:
|
| 140 |
+
- "32"
|
| 141 |
+
- "16"
|
| 142 |
+
- "8"
|
| 143 |
+
feature_indices: '{0, 1, 2}'
|
| 144 |
+
in_channels: "256"
|
| 145 |
+
discriminator_optimizer:
|
| 146 |
+
_target_: <function get_optimizer at 0x7ff2f5916660>
|
| 147 |
+
betas:
|
| 148 |
+
- "0.9"
|
| 149 |
+
- "0.999"
|
| 150 |
+
eps: "1e-08"
|
| 151 |
+
fused: "False"
|
| 152 |
+
lr: "0.0001"
|
| 153 |
+
model: null
|
| 154 |
+
optim_type: adamw
|
| 155 |
+
weight_decay: "0.01"
|
| 156 |
+
discriminator_scheduler:
|
| 157 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 158 |
+
cycle_lengths:
|
| 159 |
+
- "10000000000"
|
| 160 |
+
f_max:
|
| 161 |
+
- "1.0"
|
| 162 |
+
f_min:
|
| 163 |
+
- "1.0"
|
| 164 |
+
f_start:
|
| 165 |
+
- "1e-06"
|
| 166 |
+
warm_up_steps:
|
| 167 |
+
- "0"
|
| 168 |
+
enable_gradient_in_rollout: "True"
|
| 169 |
+
enable_preprocessors: "True"
|
| 170 |
+
fake_score: null
|
| 171 |
+
fake_score_net:
|
| 172 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 173 |
+
audio_hidden_size: "32"
|
| 174 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 175 |
+
in_dim: "65"
|
| 176 |
+
merge_lora: "False"
|
| 177 |
+
mode: v2v
|
| 178 |
+
model_size: 1.3B
|
| 179 |
+
net_pred_type: flow
|
| 180 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 181 |
+
schedule_type: rf
|
| 182 |
+
use_audio: "True"
|
| 183 |
+
fake_score_optimizer:
|
| 184 |
+
_target_: <function get_optimizer at 0x7ff2f5916660>
|
| 185 |
+
betas:
|
| 186 |
+
- "0.0"
|
| 187 |
+
- "0.999"
|
| 188 |
+
eps: "1e-08"
|
| 189 |
+
fused: "False"
|
| 190 |
+
lr: "2e-06"
|
| 191 |
+
model: null
|
| 192 |
+
optim_type: adamw
|
| 193 |
+
weight_decay: "0.01"
|
| 194 |
+
fake_score_pred_type: x0
|
| 195 |
+
fake_score_scheduler:
|
| 196 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 197 |
+
cycle_lengths:
|
| 198 |
+
- "10000000000"
|
| 199 |
+
f_max:
|
| 200 |
+
- "1.0"
|
| 201 |
+
f_min:
|
| 202 |
+
- "1.0"
|
| 203 |
+
f_start:
|
| 204 |
+
- "1e-06"
|
| 205 |
+
warm_up_steps:
|
| 206 |
+
- "0"
|
| 207 |
+
fsdp_meta_init: "False"
|
| 208 |
+
gan_loss_weight_gen: "0"
|
| 209 |
+
gan_r1_reg_alpha: "0.1"
|
| 210 |
+
gan_r1_reg_weight: "0.0"
|
| 211 |
+
gan_use_same_t_noise: "False"
|
| 212 |
+
grad_scaler_enabled: "False"
|
| 213 |
+
grad_scaler_growth_interval: "2000"
|
| 214 |
+
grad_scaler_init_scale: "65536.0"
|
| 215 |
+
guidance_scale: "4.5"
|
| 216 |
+
input_shape:
|
| 217 |
+
- "16"
|
| 218 |
+
- "21"
|
| 219 |
+
- "64"
|
| 220 |
+
- "64"
|
| 221 |
+
last_step_only: "False"
|
| 222 |
+
load_student_weights: "False"
|
| 223 |
+
net:
|
| 224 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 225 |
+
audio_hidden_size: "32"
|
| 226 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 227 |
+
chunk_size: "3"
|
| 228 |
+
in_dim: "65"
|
| 229 |
+
mode: v2v
|
| 230 |
+
model_size: 1.3B
|
| 231 |
+
net_pred_type: flow
|
| 232 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 233 |
+
schedule_type: rf
|
| 234 |
+
total_num_frames: "21"
|
| 235 |
+
use_audio: "True"
|
| 236 |
+
net_optimizer:
|
| 237 |
+
_target_: <function get_optimizer at 0x7ff2f5916660>
|
| 238 |
+
betas:
|
| 239 |
+
- "0.0"
|
| 240 |
+
- "0.999"
|
| 241 |
+
eps: "1e-08"
|
| 242 |
+
fused: "False"
|
| 243 |
+
lr: "2e-06"
|
| 244 |
+
model: null
|
| 245 |
+
optim_type: adamw
|
| 246 |
+
weight_decay: "0.01"
|
| 247 |
+
net_scheduler:
|
| 248 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 249 |
+
cycle_lengths:
|
| 250 |
+
- "10000000000"
|
| 251 |
+
f_max:
|
| 252 |
+
- "1.0"
|
| 253 |
+
f_min:
|
| 254 |
+
- "1.0"
|
| 255 |
+
f_start:
|
| 256 |
+
- "1e-06"
|
| 257 |
+
warm_up_steps:
|
| 258 |
+
- "0"
|
| 259 |
+
precision: bfloat16
|
| 260 |
+
precision_amp: null
|
| 261 |
+
precision_amp_enc: null
|
| 262 |
+
precision_amp_infer: null
|
| 263 |
+
precision_fsdp: bfloat16
|
| 264 |
+
pretrained_model_path: ""
|
| 265 |
+
pretrained_student_net_path: ""
|
| 266 |
+
same_step_across_blocks: "True"
|
| 267 |
+
sample_t_cfg:
|
| 268 |
+
log_t_df: "0.01"
|
| 269 |
+
max_t: "0.999"
|
| 270 |
+
min_t: "0.001"
|
| 271 |
+
shift: "5.0"
|
| 272 |
+
t_list:
|
| 273 |
+
- "0.999"
|
| 274 |
+
- "0.937"
|
| 275 |
+
- "0.833"
|
| 276 |
+
- "0.624"
|
| 277 |
+
- "0.0"
|
| 278 |
+
time_dist_type: shifted
|
| 279 |
+
train_p_mean: "-1.1"
|
| 280 |
+
train_p_std: "2.0"
|
| 281 |
+
skip_layers: null
|
| 282 |
+
start_gradient_frame: "0"
|
| 283 |
+
student_sample_steps: "4"
|
| 284 |
+
student_sample_type: sde
|
| 285 |
+
student_update_freq: "5"
|
| 286 |
+
teacher:
|
| 287 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 288 |
+
audio_hidden_size: "32"
|
| 289 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 290 |
+
in_dim: "65"
|
| 291 |
+
merge_lora: "True"
|
| 292 |
+
mode: v2v
|
| 293 |
+
model_size: 14B
|
| 294 |
+
net_pred_type: flow
|
| 295 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 296 |
+
schedule_type: rf
|
| 297 |
+
use_audio: "True"
|
| 298 |
+
use_ema: "False"
|
| 299 |
+
model_class:
|
| 300 |
+
value:
|
| 301 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 302 |
+
config: null
|
| 303 |
+
trainer:
|
| 304 |
+
value:
|
| 305 |
+
augment_pipe: null
|
| 306 |
+
batch_size_global: null
|
| 307 |
+
callbacks:
|
| 308 |
+
ema:
|
| 309 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 310 |
+
beta: "0.9999"
|
| 311 |
+
ema_halflife_kimg: "500"
|
| 312 |
+
ema_rampup_ratio: "0.05"
|
| 313 |
+
gamma: "16.97"
|
| 314 |
+
start_iter: "0"
|
| 315 |
+
type: constant
|
| 316 |
+
gpu_stats:
|
| 317 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 318 |
+
every_n: "100"
|
| 319 |
+
grad_clip:
|
| 320 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 321 |
+
grad_norm: "10.0"
|
| 322 |
+
model_key: net
|
| 323 |
+
param_count:
|
| 324 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 325 |
+
train_profiler:
|
| 326 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 327 |
+
every_n: "100"
|
| 328 |
+
wandb:
|
| 329 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 330 |
+
fps: "25"
|
| 331 |
+
sample_logging_iter: "100"
|
| 332 |
+
checkpointer:
|
| 333 |
+
pretrained_ckpt_key_map:
|
| 334 |
+
net: net
|
| 335 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 336 |
+
s3_container: s3://checkpoints/fastgen
|
| 337 |
+
s3_credential: ./credentials/s3.json
|
| 338 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints
|
| 339 |
+
use_s3: "False"
|
| 340 |
+
cudnn:
|
| 341 |
+
benchmark: "True"
|
| 342 |
+
deterministic: "False"
|
| 343 |
+
ddp: "False"
|
| 344 |
+
fsdp: "True"
|
| 345 |
+
fsdp_cpu_offload: "False"
|
| 346 |
+
fsdp_min_num_params: "10000000"
|
| 347 |
+
fsdp_sharding_group_size: null
|
| 348 |
+
global_vars: null
|
| 349 |
+
global_vars_val:
|
| 350 |
+
- null
|
| 351 |
+
grad_accum_rounds: "2"
|
| 352 |
+
logging_iter: "1"
|
| 353 |
+
max_iter: "20"
|
| 354 |
+
offload_module_in_decoding: "False"
|
| 355 |
+
resume: "False"
|
| 356 |
+
save_ckpt_iter: "100"
|
| 357 |
+
seed: "0"
|
| 358 |
+
skip_initial_validation: "True"
|
| 359 |
+
tf32_enabled: "True"
|
| 360 |
+
val_seed: null
|
| 361 |
+
validation_iter: "100"
|
| 362 |
+
visualize_teacher: "False"
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/output.log
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Apr 2, 2026 - 22:45:32 | INFO | fastgen.callbacks.wandb:init_wandb:259 ] Wandb code upload disabled (set WANDB_UPLOAD_CODE=true to enable)
|
| 2 |
+
[Apr 2, 2026 - 22:45:32 | SUCCESS | fastgen.trainer:__init__:53 ] Callbacks initialized successfully
|
| 3 |
+
[Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:__init__:57 ] Callback synchronization complete
|
| 4 |
+
[Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:__init__:60 ] Initializing checkpointer...
|
| 5 |
+
[Apr 2, 2026 - 22:45:33 | SUCCESS | fastgen.trainer:__init__:65 ] Checkpointer initialized successfully
|
| 6 |
+
[Apr 2, 2026 - 22:45:33 | SUCCESS | __main__:main:33 ] Trainer initialized successfully
|
| 7 |
+
[Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:run:77 ] Starting training
|
| 8 |
+
[Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:run:80 ] Initializing callbacks and model ...
|
| 9 |
+
[Apr 2, 2026 - 22:45:33 | INFO | fastgen.utils.checkpointer:load:151 ] Loading model from /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 10 |
+
[Apr 2, 2026 - 22:45:37 | INFO | fastgen.utils.checkpointer:load:154 ] Loading the model_dict...
|
| 11 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.checkpointer:load:159 ] Model net, loading info: <All keys matched successfully>
|
| 12 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:load_pretrained_ckpt:252 ] Loaded net model from net in /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth at iteration 5000
|
| 13 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:load_pretrained_ckpt:262 ] Setting resume_iter for model to 5000.
|
| 14 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:run:95 ] Starting model.on_train_begin ...
|
| 15 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.methods.model:on_train_begin:296 ] Teacher check: add_teacher_to_fsdp_dict=True, fsdp_dict keys=['net', 'fake_score', 'teacher'], teacher in fsdp_dict=True
|
| 16 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:run:99 ] model.on_train_begin completed
|
| 17 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:run:110 ] Wrapping model into fsdp ..
|
| 18 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:102 ] Fully sharding model with 4 ranks...
|
| 19 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'net' (1.42B params)...
|
| 20 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.networks.OmniAvatar.network_causal:fully_shard:1950 ] CausalOmniAvatarWan: keeping manual gradient checkpointing (not using apply_fsdp_checkpointing due to KV cache dynamics)
|
| 21 |
+
[Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
|
| 22 |
+
[Apr 2, 2026 - 22:45:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped net in 1.2s
|
| 23 |
+
[Apr 2, 2026 - 22:45:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'fake_score' (1.60B params)...
|
| 24 |
+
[Apr 2, 2026 - 22:45:50 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
|
| 25 |
+
[Apr 2, 2026 - 22:45:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
|
| 26 |
+
[Apr 2, 2026 - 22:45:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped fake_score in 1.5s
|
| 27 |
+
[Apr 2, 2026 - 22:45:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'teacher' (14.29B params)...
|
| 28 |
+
[Apr 2, 2026 - 22:45:51 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
|
| 29 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
|
| 30 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped teacher in 11.0s
|
| 31 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.trainer:run:118 ] FSDP wrapping completed
|
| 32 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.callbacks.ema:on_model_init_end:64 ] EMA ema is not enabled, skipping callback.
|
| 33 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.trainer:run:133 ] Auto-Resume Details: None
|
| 34 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.utils.basic_utils:set_random_seed:144 ] Using random seed 0.
|
| 35 |
+
[Apr 2, 2026 - 22:46:02 | INFO | fastgen.trainer:run:165 ] Instantiating dataloader...
|
| 36 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.wandb:on_dataloader_init_end:361 ] SKIP_GT_VAL_UPLOAD=1 — skipping GT val video upload
|
| 37 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.gpu_stats:on_train_begin:57 ] every_n to measure gpus stats: 1
|
| 38 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.train_profiler:on_train_begin:54 ] every_n to profile trainer: 1
|
| 39 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] model (OmniAvatarSelfForcingModel) has 1596.36 M trainable and 17311.83 M total params (logical).
|
| 40 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.42 M trainable and 4350.43 M total params LOCAL on rank 0.
|
| 41 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 1.
|
| 42 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 2.
|
| 43 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 3.
|
| 44 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] net (CausalOmniAvatarWan) has 1421.38 M trainable and 1421.38 M total params (logical).
|
| 45 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.68 M trainable and 376.68 M total params LOCAL on rank 0.
|
| 46 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 1.
|
| 47 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 2.
|
| 48 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 3.
|
| 49 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] fake_score (OmniAvatarWan) has 174.98 M trainable and 1596.36 M total params (logical).
|
| 50 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.30 M total params LOCAL on rank 0.
|
| 51 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 1.
|
| 52 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 2.
|
| 53 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 3.
|
| 54 |
+
[Apr 2, 2026 - 22:46:08 | INFO | fastgen.trainer:run:174 ] iter_start: 0
|
| 55 |
+
[MEM] fake_score_update: START: alloc=9.45GB reserved=9.88GB peak=9.60GB
|
| 56 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=12.28GB reserved=49.39GB peak=45.74GB
|
| 57 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=13.16GB peak=45.74GB
|
| 58 |
+
[MEM-fwd] after block 0: alloc=13.74GB peak=45.74GB
|
| 59 |
+
[MEM-fwd] after block 10: alloc=19.55GB peak=45.74GB
|
| 60 |
+
[MEM-fwd] after block 20: alloc=24.84GB peak=45.74GB
|
| 61 |
+
[MEM-fwd] after block 29: alloc=29.59GB peak=45.74GB
|
| 62 |
+
[MEM-fwd] after head+unpatchify: alloc=30.67GB peak=45.74GB
|
| 63 |
+
[MEM] fake_score_update: START: alloc=13.25GB reserved=15.24GB peak=54.53GB
|
| 64 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.29GB reserved=47.38GB peak=46.75GB
|
| 65 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.17GB peak=46.75GB
|
| 66 |
+
[MEM-fwd] after block 0: alloc=14.75GB peak=46.75GB
|
| 67 |
+
[MEM-fwd] after block 10: alloc=20.56GB peak=46.75GB
|
| 68 |
+
[MEM-fwd] after block 20: alloc=25.85GB peak=46.75GB
|
| 69 |
+
[MEM-fwd] after block 29: alloc=30.61GB peak=46.75GB
|
| 70 |
+
[MEM-fwd] after head+unpatchify: alloc=31.69GB peak=46.75GB
|
| 71 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 72 |
+
Avg Max Min
|
| 73 |
+
cpu_mem_gb 38.492574 38.558720 38.459610
|
| 74 |
+
peak_gpu_mem_gb 51.700073 51.700073 51.700073
|
| 75 |
+
peak_gpu_mem_reserved_gb 53.640625 53.640625 53.640625
|
| 76 |
+
util 90.500000 97.000000 86.000000
|
| 77 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 1 : data loading time 0.81
|
| 78 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 1 : avg forward pass time 15.15
|
| 79 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 1 : backward pass time 11.36
|
| 80 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 1 : optimizer step time 1.28
|
| 81 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 1--------------------
|
| 82 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0068 iter count: 1.0
|
| 83 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0068 iter count: 1.0
|
| 84 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 85 |
+
[Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 86 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=57.60GB peak=55.51GB
|
| 87 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.47GB peak=46.93GB
|
| 88 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 89 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 90 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 91 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 92 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 93 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 94 |
+
[MEM] fake_score_update: START: alloc=14.21GB reserved=58.37GB peak=55.69GB
|
| 95 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.25GB peak=47.71GB
|
| 96 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.71GB
|
| 97 |
+
[MEM-fwd] after block 0: alloc=15.71GB peak=47.71GB
|
| 98 |
+
[MEM-fwd] after block 10: alloc=21.52GB peak=47.71GB
|
| 99 |
+
[MEM-fwd] after block 20: alloc=26.81GB peak=47.71GB
|
| 100 |
+
[MEM-fwd] after block 29: alloc=31.57GB peak=47.71GB
|
| 101 |
+
[MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.71GB
|
| 102 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 103 |
+
Avg Max Min
|
| 104 |
+
cpu_mem_gb 38.666036 38.732494 38.632969
|
| 105 |
+
peak_gpu_mem_gb 52.593685 52.593685 52.593685
|
| 106 |
+
peak_gpu_mem_reserved_gb 54.365234 54.365234 54.365234
|
| 107 |
+
util 96.750000 100.000000 91.000000
|
| 108 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 2 : avg iteration time 51.61 seconds
|
| 109 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 2 : data loading time 0.00
|
| 110 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 2 : avg forward pass time 13.83
|
| 111 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 2 : backward pass time 11.39
|
| 112 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 2 : optimizer step time 1.16
|
| 113 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 2--------------------
|
| 114 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0070 iter count: 1.0
|
| 115 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0070 iter count: 1.0
|
| 116 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 117 |
+
[Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 118 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=58.37GB peak=56.47GB
|
| 119 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
|
| 120 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 121 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 122 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 123 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 124 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 125 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 126 |
+
[MEM] fake_score_update: START: alloc=14.21GB reserved=58.40GB peak=55.69GB
|
| 127 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.29GB peak=47.72GB
|
| 128 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
|
| 129 |
+
[MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
|
| 130 |
+
[MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
|
| 131 |
+
[MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
|
| 132 |
+
[MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
|
| 133 |
+
[MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
|
| 134 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 135 |
+
Avg Max Min
|
| 136 |
+
cpu_mem_gb 38.666379 38.733353 38.633148
|
| 137 |
+
peak_gpu_mem_gb 52.593685 52.593685 52.593685
|
| 138 |
+
peak_gpu_mem_reserved_gb 54.404297 54.404297 54.404297
|
| 139 |
+
util 98.000000 100.000000 94.000000
|
| 140 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 3 : avg iteration time 57.83 seconds
|
| 141 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 3 : data loading time 0.00
|
| 142 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 3 : avg forward pass time 16.98
|
| 143 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 3 : backward pass time 11.32
|
| 144 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 3 : optimizer step time 1.16
|
| 145 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 3--------------------
|
| 146 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0072 iter count: 1.0
|
| 147 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0072 iter count: 1.0
|
| 148 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 149 |
+
[Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 150 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=58.42GB peak=56.47GB
|
| 151 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.52GB peak=46.93GB
|
| 152 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 153 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 154 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 155 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 156 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 157 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 158 |
+
[MEM] fake_score_update: START: alloc=14.21GB reserved=58.42GB peak=55.69GB
|
| 159 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.27GB peak=47.72GB
|
| 160 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
|
| 161 |
+
[MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
|
| 162 |
+
[MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
|
| 163 |
+
[MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
|
| 164 |
+
[MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
|
| 165 |
+
[MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
|
| 166 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
|
| 167 |
+
Avg Max Min
|
| 168 |
+
cpu_mem_gb 38.666442 38.733406 38.633190
|
| 169 |
+
peak_gpu_mem_gb 52.593685 52.593685 52.593685
|
| 170 |
+
peak_gpu_mem_reserved_gb 54.443359 54.443359 54.443359
|
| 171 |
+
util 96.500000 99.000000 92.000000
|
| 172 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 4 : avg iteration time 57.83 seconds
|
| 173 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 4 : data loading time 0.00
|
| 174 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 4 : avg forward pass time 16.97
|
| 175 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 4 : backward pass time 11.34
|
| 176 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 4 : optimizer step time 1.19
|
| 177 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 4--------------------
|
| 178 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0073 iter count: 1.0
|
| 179 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0073 iter count: 1.0
|
| 180 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
|
| 181 |
+
[Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
|
| 182 |
+
[MEM] fake_score_update: START: alloc=13.43GB reserved=58.46GB peak=56.47GB
|
| 183 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
|
| 184 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
|
| 185 |
+
[MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
|
| 186 |
+
[MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
|
| 187 |
+
[MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
|
| 188 |
+
[MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
|
| 189 |
+
[MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
|
| 190 |
+
[MEM] student_update: START: alloc=14.32GB reserved=58.46GB peak=55.75GB
|
| 191 |
+
[MEM] student_update: after rollout: alloc=63.85GB reserved=66.24GB peak=65.62GB
|
| 192 |
+
[MEM] student_update: after perturb: alloc=63.87GB reserved=66.24GB peak=65.62GB
|
| 193 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=64.62GB peak=65.62GB
|
| 194 |
+
[MEM-fwd] after block 0: alloc=64.67GB peak=75.08GB
|
| 195 |
+
[MEM-fwd] after block 10: alloc=65.20GB peak=75.61GB
|
| 196 |
+
[MEM-fwd] after block 20: alloc=65.20GB peak=75.61GB
|
| 197 |
+
[MEM-fwd] after block 29: alloc=65.20GB peak=75.61GB
|
| 198 |
+
[MEM-fwd] after head+unpatchify: alloc=64.69GB peak=75.61GB
|
| 199 |
+
[MEM] student_update: after fake_score: alloc=63.95GB reserved=78.02GB peak=75.61GB
|
| 200 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.83GB peak=75.61GB
|
| 201 |
+
[MEM-fwd] after block 0: alloc=67.05GB peak=88.90GB
|
| 202 |
+
[MEM-fwd] after block 10: alloc=68.82GB peak=90.66GB
|
| 203 |
+
[MEM-fwd] after block 20: alloc=68.82GB peak=90.66GB
|
| 204 |
+
[MEM-fwd] after block 30: alloc=68.82GB peak=90.66GB
|
| 205 |
+
[MEM-fwd] after block 39: alloc=68.82GB peak=90.66GB
|
| 206 |
+
[MEM-fwd] after head+unpatchify: alloc=67.08GB peak=90.66GB
|
| 207 |
+
[MEM] student_update: after teacher: alloc=64.45GB reserved=93.79GB peak=90.66GB
|
| 208 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.37GB peak=90.66GB
|
| 209 |
+
[MEM-fwd] after block 0: alloc=67.08GB peak=90.66GB
|
| 210 |
+
[MEM-fwd] after block 10: alloc=68.84GB peak=90.68GB
|
| 211 |
+
[MEM-fwd] after block 20: alloc=68.84GB peak=90.68GB
|
| 212 |
+
[MEM-fwd] after block 30: alloc=68.84GB peak=90.68GB
|
| 213 |
+
[MEM-fwd] after block 39: alloc=68.84GB peak=90.68GB
|
| 214 |
+
[MEM-fwd] after head+unpatchify: alloc=67.10GB peak=90.68GB
|
| 215 |
+
[MEM] student_update: after CFG: alloc=64.45GB reserved=93.83GB peak=90.68GB
|
| 216 |
+
[MEM] student_update: after VSD loss: alloc=64.47GB reserved=93.83GB peak=90.68GB
|
| 217 |
+
[MEM] fake_score_update: START: alloc=12.68GB reserved=72.00GB peak=90.68GB
|
| 218 |
+
[MEM] fake_score_update: after student gen (no_grad): alloc=15.51GB reserved=52.62GB peak=48.98GB
|
| 219 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.28GB peak=48.98GB
|
| 220 |
+
[MEM-fwd] after block 0: alloc=16.91GB peak=48.98GB
|
| 221 |
+
[MEM-fwd] after block 10: alloc=22.73GB peak=48.98GB
|
| 222 |
+
[MEM-fwd] after block 20: alloc=28.01GB peak=48.98GB
|
| 223 |
+
[MEM-fwd] after block 29: alloc=32.77GB peak=48.98GB
|
| 224 |
+
[MEM-fwd] after head+unpatchify: alloc=33.85GB peak=48.98GB
|
| 225 |
+
[MEM] student_update: START: alloc=15.52GB reserved=61.80GB peak=56.95GB
|
| 226 |
+
[MEM] student_update: after rollout: alloc=65.05GB reserved=68.04GB peak=66.83GB
|
| 227 |
+
[MEM] student_update: after perturb: alloc=65.07GB reserved=68.04GB peak=66.83GB
|
| 228 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=65.82GB peak=66.83GB
|
| 229 |
+
[MEM-fwd] after block 0: alloc=65.87GB peak=76.28GB
|
| 230 |
+
[MEM-fwd] after block 10: alloc=66.40GB peak=76.81GB
|
| 231 |
+
[MEM-fwd] after block 20: alloc=66.40GB peak=76.81GB
|
| 232 |
+
[MEM-fwd] after block 29: alloc=66.40GB peak=76.81GB
|
| 233 |
+
[MEM-fwd] after head+unpatchify: alloc=65.90GB peak=76.81GB
|
| 234 |
+
[MEM] student_update: after fake_score: alloc=65.15GB reserved=79.81GB peak=76.81GB
|
| 235 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.08GB peak=76.81GB
|
| 236 |
+
[MEM-fwd] after block 0: alloc=67.78GB peak=89.62GB
|
| 237 |
+
[MEM-fwd] after block 10: alloc=69.54GB peak=91.39GB
|
| 238 |
+
[MEM-fwd] after block 20: alloc=69.54GB peak=91.39GB
|
| 239 |
+
[MEM-fwd] after block 30: alloc=69.54GB peak=91.39GB
|
| 240 |
+
[MEM-fwd] after block 39: alloc=69.54GB peak=91.39GB
|
| 241 |
+
[MEM-fwd] after head+unpatchify: alloc=67.80GB peak=91.39GB
|
| 242 |
+
[MEM] student_update: after teacher: alloc=65.17GB reserved=94.89GB peak=91.39GB
|
| 243 |
+
[MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.10GB peak=91.39GB
|
| 244 |
+
[MEM-fwd] after block 0: alloc=67.80GB peak=91.39GB
|
| 245 |
+
[MEM-fwd] after block 10: alloc=69.57GB peak=91.41GB
|
| 246 |
+
[MEM-fwd] after block 20: alloc=69.57GB peak=91.41GB
|
| 247 |
+
[MEM-fwd] after block 30: alloc=69.57GB peak=91.41GB
|
| 248 |
+
[MEM-fwd] after block 39: alloc=69.57GB peak=91.41GB
|
| 249 |
+
[MEM-fwd] after head+unpatchify: alloc=67.83GB peak=91.41GB
|
| 250 |
+
[MEM] student_update: after CFG: alloc=65.17GB reserved=94.91GB peak=91.41GB
|
| 251 |
+
[MEM] student_update: after VSD loss: alloc=65.20GB reserved=94.91GB peak=91.41GB
|
| 252 |
+
Traceback (most recent call last):
|
| 253 |
+
File "/home/work/.local/hyunbin/FastGen/train.py", line 46, in <module>
|
| 254 |
+
main(config)
|
| 255 |
+
File "/home/work/.local/hyunbin/FastGen/train.py", line 37, in main
|
| 256 |
+
fastgen_trainer.run(model)
|
| 257 |
+
File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 194, in run
|
| 258 |
+
loss_map, outputs = self.train_step(model_ddp, model, data, iter_cur, grad_accum_iter)
|
| 259 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 260 |
+
File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 331, in train_step
|
| 261 |
+
model.grad_scaler.scale(loss_map["total_loss"] / grad_accum_rounds).backward()
|
| 262 |
+
File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/_tensor.py", line 630, in backward
|
| 263 |
+
torch.autograd.backward(
|
| 264 |
+
File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/__init__.py", line 364, in backward
|
| 265 |
+
_engine_run_backward(
|
| 266 |
+
File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/graph.py", line 865, in _engine_run_backward
|
| 267 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 268 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 269 |
+
File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 1177, in unpack_hook
|
| 270 |
+
frame.check_recomputed_tensors_match(gid)
|
| 271 |
+
File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 882, in check_recomputed_tensors_match
|
| 272 |
+
raise CheckpointError(
|
| 273 |
+
torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: A different number of tensors was saved during the original forward and recomputation.
|
| 274 |
+
Number of tensors saved during forward: 94
|
| 275 |
+
Number of tensors saved during recomputation: 80.
|
| 276 |
+
|
| 277 |
+
Tip: To see a more detailed error message, either pass `debug=True` to
|
| 278 |
+
`torch.utils.checkpoint.checkpoint(...)` or wrap the code block
|
| 279 |
+
with `with torch.utils.checkpoint.set_checkpoint_debug_enabled(True):` to
|
| 280 |
+
enable checkpoint‑debug mode globally.
|
| 281 |
+
|
| 282 |
+
[rank0]: Traceback (most recent call last):
|
| 283 |
+
[rank0]: File "/home/work/.local/hyunbin/FastGen/train.py", line 46, in <module>
|
| 284 |
+
[rank0]: main(config)
|
| 285 |
+
[rank0]: File "/home/work/.local/hyunbin/FastGen/train.py", line 37, in main
|
| 286 |
+
[rank0]: fastgen_trainer.run(model)
|
| 287 |
+
[rank0]: File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 194, in run
|
| 288 |
+
[rank0]: loss_map, outputs = self.train_step(model_ddp, model, data, iter_cur, grad_accum_iter)
|
| 289 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 290 |
+
[rank0]: File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 331, in train_step
|
| 291 |
+
[rank0]: model.grad_scaler.scale(loss_map["total_loss"] / grad_accum_rounds).backward()
|
| 292 |
+
[rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/_tensor.py", line 630, in backward
|
| 293 |
+
[rank0]: torch.autograd.backward(
|
| 294 |
+
[rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/__init__.py", line 364, in backward
|
| 295 |
+
[rank0]: _engine_run_backward(
|
| 296 |
+
[rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/graph.py", line 865, in _engine_run_backward
|
| 297 |
+
[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 298 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 299 |
+
[rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 1177, in unpack_hook
|
| 300 |
+
[rank0]: frame.check_recomputed_tensors_match(gid)
|
| 301 |
+
[rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 882, in check_recomputed_tensors_match
|
| 302 |
+
[rank0]: raise CheckpointError(
|
| 303 |
+
[rank0]: torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: A different number of tensors was saved during the original forward and recomputation.
|
| 304 |
+
[rank0]: Number of tensors saved during forward: 94
|
| 305 |
+
[rank0]: Number of tensors saved during recomputation: 80.
|
| 306 |
+
|
| 307 |
+
[rank0]: Tip: To see a more detailed error message, either pass `debug=True` to
|
| 308 |
+
[rank0]: `torch.utils.checkpoint.checkpoint(...)` or wrap the code block
|
| 309 |
+
[rank0]: with `with torch.utils.checkpoint.set_checkpoint_debug_enabled(True):` to
|
| 310 |
+
[rank0]: enable checkpoint‑debug mode globally.
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/requirements.txt
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastgen==0.1.0
|
| 2 |
+
nvitop==1.6.1
|
| 3 |
+
ftfy==6.3.1
|
| 4 |
+
braceexpand==0.1.7
|
| 5 |
+
antlr4-python3-runtime==4.9.3
|
| 6 |
+
webdataset==1.0.2
|
| 7 |
+
sentry-sdk==2.53.0
|
| 8 |
+
rdkit==2025.9.5
|
| 9 |
+
python-dotenv==1.2.1
|
| 10 |
+
proglog==0.1.12
|
| 11 |
+
omegaconf==2.3.0
|
| 12 |
+
narwhals==2.17.0
|
| 13 |
+
loguru==0.7.3
|
| 14 |
+
imageio-ffmpeg==0.6.0
|
| 15 |
+
plotly==6.5.2
|
| 16 |
+
moviepy==2.2.1
|
| 17 |
+
hydra-core==1.3.2
|
| 18 |
+
wandb==0.25.0
|
| 19 |
+
fastgen==0.1.0
|
| 20 |
+
packaging==25.0
|
| 21 |
+
setuptools==80.10.2
|
| 22 |
+
wheel==0.46.3
|
| 23 |
+
pip==26.0.1
|
| 24 |
+
webencodings==0.5.1
|
| 25 |
+
pure_eval==0.2.3
|
| 26 |
+
ptyprocess==0.7.0
|
| 27 |
+
nvidia-ml-py==13.590.48
|
| 28 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 29 |
+
mpmath==1.3.0
|
| 30 |
+
fastjsonschema==2.21.2
|
| 31 |
+
zipp==3.23.0
|
| 32 |
+
xyzservices==2025.11.0
|
| 33 |
+
widgetsnbextension==4.0.15
|
| 34 |
+
websocket-client==1.9.0
|
| 35 |
+
webcolors==25.10.0
|
| 36 |
+
wcwidth==0.6.0
|
| 37 |
+
urllib3==2.6.3
|
| 38 |
+
uri-template==1.3.0
|
| 39 |
+
tzdata==2025.3
|
| 40 |
+
typing_extensions==4.15.0
|
| 41 |
+
triton==3.6.0
|
| 42 |
+
traitlets==5.14.3
|
| 43 |
+
tqdm==4.67.3
|
| 44 |
+
tornado==6.5.5
|
| 45 |
+
tinycss2==1.4.0
|
| 46 |
+
sympy==1.14.0
|
| 47 |
+
soupsieve==2.8.3
|
| 48 |
+
smmap==5.0.3
|
| 49 |
+
six==1.16.0
|
| 50 |
+
sentencepiece==0.2.1
|
| 51 |
+
Send2Trash==2.1.0
|
| 52 |
+
safetensors==0.7.0
|
| 53 |
+
rpds-py==0.30.0
|
| 54 |
+
rfc3986-validator==0.1.1
|
| 55 |
+
regex==2026.2.28
|
| 56 |
+
pyzmq==27.1.0
|
| 57 |
+
PyYAML==6.0.3
|
| 58 |
+
python-json-logger==4.0.0
|
| 59 |
+
Pygments==2.19.2
|
| 60 |
+
pycparser==3.0
|
| 61 |
+
psutil==7.2.2
|
| 62 |
+
protobuf==4.24.4
|
| 63 |
+
prometheus_client==0.24.1
|
| 64 |
+
platformdirs==4.9.4
|
| 65 |
+
pillow==11.3.0
|
| 66 |
+
pexpect==4.9.0
|
| 67 |
+
parso==0.8.6
|
| 68 |
+
pandocfilters==1.5.1
|
| 69 |
+
nvidia-nvtx-cu12==12.8.90
|
| 70 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 71 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 72 |
+
nvidia-nccl-cu12==2.27.5
|
| 73 |
+
nvidia-curand-cu12==10.3.9.90
|
| 74 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 76 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 77 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 78 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 79 |
+
numpy==1.26.4
|
| 80 |
+
networkx==3.6.1
|
| 81 |
+
nest-asyncio==1.6.0
|
| 82 |
+
mistune==3.2.0
|
| 83 |
+
MarkupSafe==3.0.3
|
| 84 |
+
lark==1.3.1
|
| 85 |
+
jupyterlab_widgets==3.0.16
|
| 86 |
+
jupyterlab_pygments==0.3.0
|
| 87 |
+
jsonpointer==3.0.0
|
| 88 |
+
json5==0.13.0
|
| 89 |
+
jmespath==1.1.0
|
| 90 |
+
idna==3.11
|
| 91 |
+
hf-xet==1.4.2
|
| 92 |
+
h11==0.16.0
|
| 93 |
+
fsspec==2026.2.0
|
| 94 |
+
fqdn==1.5.1
|
| 95 |
+
filelock==3.25.2
|
| 96 |
+
executing==2.2.1
|
| 97 |
+
einops==0.8.2
|
| 98 |
+
defusedxml==0.7.1
|
| 99 |
+
decorator==5.2.1
|
| 100 |
+
debugpy==1.8.20
|
| 101 |
+
cuda-pathfinder==1.4.2
|
| 102 |
+
comm==0.2.3
|
| 103 |
+
click==8.3.1
|
| 104 |
+
charset-normalizer==3.4.5
|
| 105 |
+
certifi==2026.2.25
|
| 106 |
+
bleach==6.3.0
|
| 107 |
+
babel==2.18.0
|
| 108 |
+
av==17.0.0
|
| 109 |
+
attrs==25.4.0
|
| 110 |
+
async-lru==2.2.0
|
| 111 |
+
asttokens==3.0.1
|
| 112 |
+
annotated-types==0.7.0
|
| 113 |
+
typing-inspection==0.4.2
|
| 114 |
+
terminado==0.18.1
|
| 115 |
+
stack-data==0.6.3
|
| 116 |
+
scipy==1.17.1
|
| 117 |
+
rfc3987-syntax==1.1.0
|
| 118 |
+
rfc3339-validator==0.1.4
|
| 119 |
+
requests==2.32.5
|
| 120 |
+
referencing==0.37.0
|
| 121 |
+
python-dateutil==2.9.0.post0
|
| 122 |
+
pydantic_core==2.41.5
|
| 123 |
+
prompt_toolkit==3.0.52
|
| 124 |
+
opencv-python-headless==4.11.0.86
|
| 125 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 126 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 127 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 128 |
+
matplotlib-inline==0.2.1
|
| 129 |
+
jupyter_core==5.9.1
|
| 130 |
+
Jinja2==3.1.6
|
| 131 |
+
jedi==0.19.2
|
| 132 |
+
ipython_pygments_lexers==1.1.1
|
| 133 |
+
importlib_metadata==8.7.1
|
| 134 |
+
ImageIO==2.37.3
|
| 135 |
+
httpcore==1.0.9
|
| 136 |
+
gitdb==4.0.12
|
| 137 |
+
cuda-bindings==12.9.4
|
| 138 |
+
contourpy==1.3.3
|
| 139 |
+
cffi==2.0.0
|
| 140 |
+
beautifulsoup4==4.14.3
|
| 141 |
+
anyio==4.12.1
|
| 142 |
+
soundfile==0.13.1
|
| 143 |
+
pydantic==2.12.5
|
| 144 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 145 |
+
jupyter_server_terminals==0.5.4
|
| 146 |
+
jupyter_client==8.8.0
|
| 147 |
+
jsonschema-specifications==2025.9.1
|
| 148 |
+
ipython==9.11.0
|
| 149 |
+
httpx==0.28.1
|
| 150 |
+
GitPython==3.1.46
|
| 151 |
+
botocore==1.42.68
|
| 152 |
+
bokeh==3.9.0
|
| 153 |
+
arrow==1.4.0
|
| 154 |
+
argon2-cffi-bindings==25.1.0
|
| 155 |
+
torch==2.10.0
|
| 156 |
+
s3transfer==0.16.0
|
| 157 |
+
jsonschema==4.26.0
|
| 158 |
+
isoduration==20.11.0
|
| 159 |
+
ipywidgets==8.1.8
|
| 160 |
+
ipykernel==7.2.0
|
| 161 |
+
argon2-cffi==25.1.0
|
| 162 |
+
torchvision==0.25.0
|
| 163 |
+
nbformat==5.10.4
|
| 164 |
+
jupyter-console==6.6.3
|
| 165 |
+
boto3==1.42.68
|
| 166 |
+
accelerate==1.13.0
|
| 167 |
+
nbclient==0.10.4
|
| 168 |
+
jupyter-events==0.12.0
|
| 169 |
+
nbconvert==7.17.0
|
| 170 |
+
jupyter_server==2.17.0
|
| 171 |
+
notebook_shim==0.2.4
|
| 172 |
+
jupyterlab_server==2.28.0
|
| 173 |
+
jupyter-lsp==2.3.0
|
| 174 |
+
jupyterlab==4.5.6
|
| 175 |
+
notebook==7.5.5
|
| 176 |
+
jupyter==1.1.1
|
| 177 |
+
fastgen==0.1.0
|
| 178 |
+
pandas==3.0.1
|
| 179 |
+
shellingham==1.5.4
|
| 180 |
+
mdurl==0.1.2
|
| 181 |
+
annotated-doc==0.0.4
|
| 182 |
+
markdown-it-py==4.0.0
|
| 183 |
+
rich==14.3.3
|
| 184 |
+
typer==0.24.1
|
| 185 |
+
huggingface_hub==1.7.1
|
| 186 |
+
timm==1.0.25
|
| 187 |
+
tokenizers==0.22.2
|
| 188 |
+
diffusers==0.37.0
|
| 189 |
+
transformers==5.3.0
|
| 190 |
+
peft==0.18.1
|
| 191 |
+
easydict==1.13
|
| 192 |
+
lmdb==2.2.0
|
| 193 |
+
threadpoolctl==3.6.0
|
| 194 |
+
soxr==1.0.0
|
| 195 |
+
msgpack==1.1.2
|
| 196 |
+
llvmlite==0.47.0
|
| 197 |
+
lazy-loader==0.5
|
| 198 |
+
joblib==1.5.3
|
| 199 |
+
audioread==3.1.0
|
| 200 |
+
scikit-learn==1.8.0
|
| 201 |
+
pooch==1.9.0
|
| 202 |
+
numba==0.65.0
|
| 203 |
+
librosa==0.11.0
|
| 204 |
+
simsimd==6.5.16
|
| 205 |
+
flatbuffers==25.12.19
|
| 206 |
+
tifffile==2026.3.3
|
| 207 |
+
stringzilla==4.6.0
|
| 208 |
+
pyparsing==3.3.2
|
| 209 |
+
prettytable==3.17.0
|
| 210 |
+
onnx==1.17.0
|
| 211 |
+
kiwisolver==1.5.0
|
| 212 |
+
fonttools==4.62.1
|
| 213 |
+
Cython==3.2.4
|
| 214 |
+
cycler==0.12.1
|
| 215 |
+
scikit-image==0.26.0
|
| 216 |
+
onnxruntime==1.24.4
|
| 217 |
+
matplotlib==3.10.8
|
| 218 |
+
albucore==0.0.24
|
| 219 |
+
albumentations==2.0.8
|
| 220 |
+
insightface==0.7.3
|
| 221 |
+
kornia_rs==0.1.10
|
| 222 |
+
kornia==0.8.2
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-151-generic-x86_64-with-glibc2.39",
|
| 3 |
+
"python": "CPython 3.12.12",
|
| 4 |
+
"startedAt": "2026-04-02T13:45:31.145420Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config=fastgen/configs/experiments/OmniAvatar/config_sf.py",
|
| 7 |
+
"-",
|
| 8 |
+
"trainer.resume=False",
|
| 9 |
+
"log_config.name=sf_combined_step_test",
|
| 10 |
+
"log_config.project=OmniAvatar-FastGen",
|
| 11 |
+
"trainer.max_iter=20"
|
| 12 |
+
],
|
| 13 |
+
"program": "/home/work/.local/hyunbin/FastGen/train.py",
|
| 14 |
+
"codePath": "train.py",
|
| 15 |
+
"codePathLocal": "train.py",
|
| 16 |
+
"git": {
|
| 17 |
+
"remote": "https://paulcho98:@github.com/paulcho98/FastGen.git",
|
| 18 |
+
"commit": "04de80beaf50f849c12a55a5d8358d94530b7bb5"
|
| 19 |
+
},
|
| 20 |
+
"email": "paul.hyunbin@gmail.com",
|
| 21 |
+
"root": "/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test",
|
| 22 |
+
"host": "main1",
|
| 23 |
+
"executable": "/home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12",
|
| 24 |
+
"cpu_count": 112,
|
| 25 |
+
"cpu_count_logical": 224,
|
| 26 |
+
"gpu": "NVIDIA H200",
|
| 27 |
+
"gpu_count": 4,
|
| 28 |
+
"disk": {
|
| 29 |
+
"/": {
|
| 30 |
+
"total": "1356758433792",
|
| 31 |
+
"used": "257963536384"
|
| 32 |
+
}
|
| 33 |
+
},
|
| 34 |
+
"memory": {
|
| 35 |
+
"total": "2163961778176"
|
| 36 |
+
},
|
| 37 |
+
"gpu_nvidia": [
|
| 38 |
+
{
|
| 39 |
+
"name": "NVIDIA H200",
|
| 40 |
+
"memoryTotal": "150754820096",
|
| 41 |
+
"cudaCores": 16896,
|
| 42 |
+
"architecture": "Hopper",
|
| 43 |
+
"uuid": "GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"name": "NVIDIA H200",
|
| 47 |
+
"memoryTotal": "150754820096",
|
| 48 |
+
"cudaCores": 16896,
|
| 49 |
+
"architecture": "Hopper",
|
| 50 |
+
"uuid": "GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"name": "NVIDIA H200",
|
| 54 |
+
"memoryTotal": "150754820096",
|
| 55 |
+
"cudaCores": 16896,
|
| 56 |
+
"architecture": "Hopper",
|
| 57 |
+
"uuid": "GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA H200",
|
| 61 |
+
"memoryTotal": "150754820096",
|
| 62 |
+
"cudaCores": 16896,
|
| 63 |
+
"architecture": "Hopper",
|
| 64 |
+
"uuid": "GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0"
|
| 65 |
+
}
|
| 66 |
+
],
|
| 67 |
+
"cudaVersion": "12.9",
|
| 68 |
+
"writerId": "mf7qlulwtbceq6fkw52thj4sgf25e3dz"
|
| 69 |
+
}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"fake_score/local_total_params":[399300904,399021352,399021352,399021352],"optimizer/non_finite_grads_count (model_key net)":0,"_wandb":{"runtime":445},"optimizer/grad_norm (model_key net)":0,"_runtime":445.189429909,"fake_score/total_params":1596364960,"profiler/avg_forward_pass_time":16.967717550694942,"model/trainable_params":1596364960,"model/total_params":17311830496,"train/fake_score_loss":0.0072784423828125,"_timestamp":1.7751377931214402e+09,"profiler/avg_iteration_time":57.83046340942383,"profiler/data_loading_time":0.000791529193520546,"fake_score/local_trainable_params":[43745280,43745280,43745280,43745280],"profiler/backward_pass_time":11.342406308278441,"net/local_total_params":[376675360,376398880,376398880,376398880],"train/total_loss":0.0072784423828125,"model/local_total_params":[4350425968,4348630896,4348630896,4348630896],"_step":4,"optimizer/lr_fake_score":2e-06,"net/trainable_params":1421383840,"train/gan_loss_disc":0,"profiler/optimizer_step_time":1.187411269173026,"net/total_params":1421383840,"optimizer/lr_net":2e-06,"model/local_trainable_params":[420420640,420144160,420144160,420144160],"fake_score/trainable_params":174981120,"net/local_trainable_params":[376675360,376398880,376398880,376398880],"optimizer/iteration":4}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-core.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-02T22:45:31.195756192+09:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmphkxult0z/port-792541.txt","pid":792541,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-04-02T22:45:31.196239435+09:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":792541}
|
| 3 |
+
{"time":"2026-04-02T22:45:31.196228389+09:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-792541-806747-3978962853/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-04-02T22:45:31.38167949+09:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-04-02T22:45:31.396742512+09:00","level":"INFO","msg":"handleInformInit: received","streamId":"spcd04xe","id":"1(@)"}
|
| 6 |
+
{"time":"2026-04-02T22:45:31.746828856+09:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"spcd04xe","id":"1(@)"}
|
| 7 |
+
{"time":"2026-04-02T22:45:37.997230573+09:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"ve1uhd81uimn"}
|
| 8 |
+
{"time":"2026-04-02T22:52:57.910992536+09:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 9 |
+
{"time":"2026-04-02T22:52:57.911042628+09:00","level":"INFO","msg":"server is shutting down"}
|
| 10 |
+
{"time":"2026-04-02T22:52:57.911035318+09:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 11 |
+
{"time":"2026-04-02T22:52:57.911115888+09:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 12 |
+
{"time":"2026-04-02T22:52:57.911110682+09:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-792541-806747-3978962853/socket","Net":"unix"}}
|
| 13 |
+
{"time":"2026-04-02T22:52:58.870224925+09:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 14 |
+
{"time":"2026-04-02T22:52:58.87024994+09:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 15 |
+
{"time":"2026-04-02T22:52:58.870261844+09:00","level":"INFO","msg":"server is closed"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-02T22:45:31.396833745+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
|
| 2 |
+
{"time":"2026-04-02T22:45:31.746696076+09:00","level":"INFO","msg":"stream: created new stream","id":"spcd04xe"}
|
| 3 |
+
{"time":"2026-04-02T22:45:31.746744779+09:00","level":"INFO","msg":"handler: started","stream_id":"spcd04xe"}
|
| 4 |
+
{"time":"2026-04-02T22:45:31.746822827+09:00","level":"INFO","msg":"stream: started","id":"spcd04xe"}
|
| 5 |
+
{"time":"2026-04-02T22:45:31.746841154+09:00","level":"INFO","msg":"sender: started","stream_id":"spcd04xe"}
|
| 6 |
+
{"time":"2026-04-02T22:45:31.74684523+09:00","level":"INFO","msg":"writer: started","stream_id":"spcd04xe"}
|
| 7 |
+
{"time":"2026-04-02T22:52:57.91103952+09:00","level":"INFO","msg":"stream: closing","id":"spcd04xe"}
|
| 8 |
+
{"time":"2026-04-02T22:52:58.486184439+09:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-04-02T22:52:58.869727008+09:00","level":"INFO","msg":"handler: closed","stream_id":"spcd04xe"}
|
| 10 |
+
{"time":"2026-04-02T22:52:58.869816672+09:00","level":"INFO","msg":"sender: closed","stream_id":"spcd04xe"}
|
| 11 |
+
{"time":"2026-04-02T22:52:58.869827326+09:00","level":"INFO","msg":"stream: closed","id":"spcd04xe"}
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
|
| 2 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Configure stats pid to 792541
|
| 3 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log
|
| 5 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log
|
| 6 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_combined_step_test', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '20', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7ff2f56a3ce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
|
| 9 |
+
2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-04-02 22:45:31,381 INFO MainThread:792541 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-04-02 22:45:31,394 INFO MainThread:792541 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-04-02 22:45:31,398 INFO MainThread:792541 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-04-02 22:45:31,413 INFO MainThread:792541 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-02 22:45:32,719 INFO MainThread:792541 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_console_start():2524] atexit reg
|
| 16 |
+
2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_redirect():2373] redirect: wrap_raw
|
| 17 |
+
2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2442] Wrapping output streams.
|
| 18 |
+
2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2465] Redirects installed.
|
| 19 |
+
2026-04-02 22:45:32,998 INFO MainThread:792541 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [service_client.py:_forward_responses():134] Reached EOF.
|
| 21 |
+
2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/run-spcd04xe.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c85a6826c3e4463f05f583260010422503fd8df994d4cb8222a7140a5f63297a
|
| 3 |
+
size 132588
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb_id.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
spcd04xe
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_no_reqgrad_toggle/config.yaml
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataloader_train:
|
| 2 |
+
_target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
|
| 3 |
+
batch_size: '8'
|
| 4 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
|
| 5 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 6 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 7 |
+
num_workers: '2'
|
| 8 |
+
use_ref_sequence: 'True'
|
| 9 |
+
dataloader_val:
|
| 10 |
+
_target_: <function create_omniavatar_dataloader at 0x7feb4c60be20>
|
| 11 |
+
batch_size: '1'
|
| 12 |
+
data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
|
| 13 |
+
latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
|
| 14 |
+
load_ode_path: 'False'
|
| 15 |
+
neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
|
| 16 |
+
num_workers: '2'
|
| 17 |
+
use_ref_sequence: 'True'
|
| 18 |
+
eval:
|
| 19 |
+
max_ckpt: '100000000'
|
| 20 |
+
min_ckpt: '0'
|
| 21 |
+
num_samples: '50000'
|
| 22 |
+
samples_dir: samples
|
| 23 |
+
save_images: 'False'
|
| 24 |
+
log_config:
|
| 25 |
+
group: omniavatar_sf
|
| 26 |
+
name: sf_no_reqgrad_toggle
|
| 27 |
+
project: OmniAvatar-FastGen
|
| 28 |
+
wandb_credential: ./credentials/wandb_api.txt
|
| 29 |
+
wandb_entity: paulhcho
|
| 30 |
+
wandb_mode: disabled
|
| 31 |
+
model:
|
| 32 |
+
add_teacher_to_fsdp_dict: 'True'
|
| 33 |
+
context_noise: '0.0'
|
| 34 |
+
ddp_find_unused_parameters: 'True'
|
| 35 |
+
device: cuda
|
| 36 |
+
discriminator:
|
| 37 |
+
_target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
|
| 38 |
+
all_res:
|
| 39 |
+
- '32'
|
| 40 |
+
- '16'
|
| 41 |
+
- '8'
|
| 42 |
+
feature_indices: '{0, 1, 2}'
|
| 43 |
+
in_channels: '256'
|
| 44 |
+
discriminator_optimizer:
|
| 45 |
+
_target_: <function get_optimizer at 0x7feb4c86a660>
|
| 46 |
+
betas:
|
| 47 |
+
- '0.9'
|
| 48 |
+
- '0.999'
|
| 49 |
+
eps: 1e-08
|
| 50 |
+
fused: 'False'
|
| 51 |
+
lr: '0.0001'
|
| 52 |
+
model: null
|
| 53 |
+
optim_type: adamw
|
| 54 |
+
weight_decay: '0.01'
|
| 55 |
+
discriminator_scheduler:
|
| 56 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 57 |
+
cycle_lengths:
|
| 58 |
+
- '10000000000'
|
| 59 |
+
f_max:
|
| 60 |
+
- '1.0'
|
| 61 |
+
f_min:
|
| 62 |
+
- '1.0'
|
| 63 |
+
f_start:
|
| 64 |
+
- 1e-06
|
| 65 |
+
warm_up_steps:
|
| 66 |
+
- '0'
|
| 67 |
+
enable_gradient_in_rollout: 'True'
|
| 68 |
+
enable_preprocessors: 'True'
|
| 69 |
+
fake_score: null
|
| 70 |
+
fake_score_net:
|
| 71 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 72 |
+
audio_hidden_size: '32'
|
| 73 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 74 |
+
in_dim: '65'
|
| 75 |
+
merge_lora: 'False'
|
| 76 |
+
mode: v2v
|
| 77 |
+
model_size: 1.3B
|
| 78 |
+
net_pred_type: flow
|
| 79 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 80 |
+
schedule_type: rf
|
| 81 |
+
use_audio: 'True'
|
| 82 |
+
fake_score_optimizer:
|
| 83 |
+
_target_: <function get_optimizer at 0x7feb4c86a660>
|
| 84 |
+
betas:
|
| 85 |
+
- '0.0'
|
| 86 |
+
- '0.999'
|
| 87 |
+
eps: 1e-08
|
| 88 |
+
fused: 'False'
|
| 89 |
+
lr: 2e-06
|
| 90 |
+
model: null
|
| 91 |
+
optim_type: adamw
|
| 92 |
+
weight_decay: '0.01'
|
| 93 |
+
fake_score_pred_type: x0
|
| 94 |
+
fake_score_scheduler:
|
| 95 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 96 |
+
cycle_lengths:
|
| 97 |
+
- '10000000000'
|
| 98 |
+
f_max:
|
| 99 |
+
- '1.0'
|
| 100 |
+
f_min:
|
| 101 |
+
- '1.0'
|
| 102 |
+
f_start:
|
| 103 |
+
- 1e-06
|
| 104 |
+
warm_up_steps:
|
| 105 |
+
- '0'
|
| 106 |
+
fsdp_meta_init: 'False'
|
| 107 |
+
gan_loss_weight_gen: '0'
|
| 108 |
+
gan_r1_reg_alpha: '0.1'
|
| 109 |
+
gan_r1_reg_weight: '0.0'
|
| 110 |
+
gan_use_same_t_noise: 'False'
|
| 111 |
+
grad_scaler_enabled: 'False'
|
| 112 |
+
grad_scaler_growth_interval: '2000'
|
| 113 |
+
grad_scaler_init_scale: '65536.0'
|
| 114 |
+
guidance_scale: '4.5'
|
| 115 |
+
input_shape:
|
| 116 |
+
- '16'
|
| 117 |
+
- '21'
|
| 118 |
+
- '64'
|
| 119 |
+
- '64'
|
| 120 |
+
last_step_only: 'False'
|
| 121 |
+
load_student_weights: 'False'
|
| 122 |
+
net:
|
| 123 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
|
| 124 |
+
audio_hidden_size: '32'
|
| 125 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 126 |
+
chunk_size: '3'
|
| 127 |
+
in_dim: '65'
|
| 128 |
+
mode: v2v
|
| 129 |
+
model_size: 1.3B
|
| 130 |
+
net_pred_type: flow
|
| 131 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
|
| 132 |
+
schedule_type: rf
|
| 133 |
+
total_num_frames: '21'
|
| 134 |
+
use_audio: 'True'
|
| 135 |
+
net_optimizer:
|
| 136 |
+
_target_: <function get_optimizer at 0x7feb4c86a660>
|
| 137 |
+
betas:
|
| 138 |
+
- '0.0'
|
| 139 |
+
- '0.999'
|
| 140 |
+
eps: 1e-08
|
| 141 |
+
fused: 'False'
|
| 142 |
+
lr: 2e-06
|
| 143 |
+
model: null
|
| 144 |
+
optim_type: adamw
|
| 145 |
+
weight_decay: '0.01'
|
| 146 |
+
net_scheduler:
|
| 147 |
+
_target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
|
| 148 |
+
cycle_lengths:
|
| 149 |
+
- '10000000000'
|
| 150 |
+
f_max:
|
| 151 |
+
- '1.0'
|
| 152 |
+
f_min:
|
| 153 |
+
- '1.0'
|
| 154 |
+
f_start:
|
| 155 |
+
- 1e-06
|
| 156 |
+
warm_up_steps:
|
| 157 |
+
- '0'
|
| 158 |
+
precision: bfloat16
|
| 159 |
+
precision_amp: null
|
| 160 |
+
precision_amp_enc: null
|
| 161 |
+
precision_amp_infer: null
|
| 162 |
+
precision_fsdp: bfloat16
|
| 163 |
+
pretrained_model_path: ''
|
| 164 |
+
pretrained_student_net_path: ''
|
| 165 |
+
same_step_across_blocks: 'True'
|
| 166 |
+
sample_t_cfg:
|
| 167 |
+
log_t_df: '0.01'
|
| 168 |
+
max_t: '0.999'
|
| 169 |
+
min_t: '0.001'
|
| 170 |
+
shift: '5.0'
|
| 171 |
+
t_list:
|
| 172 |
+
- '0.999'
|
| 173 |
+
- '0.937'
|
| 174 |
+
- '0.833'
|
| 175 |
+
- '0.624'
|
| 176 |
+
- '0.0'
|
| 177 |
+
time_dist_type: shifted
|
| 178 |
+
train_p_mean: '-1.1'
|
| 179 |
+
train_p_std: '2.0'
|
| 180 |
+
skip_layers: null
|
| 181 |
+
start_gradient_frame: '0'
|
| 182 |
+
student_sample_steps: '4'
|
| 183 |
+
student_sample_type: sde
|
| 184 |
+
student_update_freq: '5'
|
| 185 |
+
teacher:
|
| 186 |
+
_target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
|
| 187 |
+
audio_hidden_size: '32'
|
| 188 |
+
base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
|
| 189 |
+
in_dim: '65'
|
| 190 |
+
merge_lora: 'True'
|
| 191 |
+
mode: v2v
|
| 192 |
+
model_size: 14B
|
| 193 |
+
net_pred_type: flow
|
| 194 |
+
omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
|
| 195 |
+
schedule_type: rf
|
| 196 |
+
use_audio: 'True'
|
| 197 |
+
use_ema: 'False'
|
| 198 |
+
model_class:
|
| 199 |
+
_target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
|
| 200 |
+
config: null
|
| 201 |
+
trainer:
|
| 202 |
+
augment_pipe: null
|
| 203 |
+
batch_size_global: null
|
| 204 |
+
callbacks:
|
| 205 |
+
ema:
|
| 206 |
+
_target_: <class 'fastgen.callbacks.ema.EMACallback'>
|
| 207 |
+
beta: '0.9999'
|
| 208 |
+
ema_halflife_kimg: '500'
|
| 209 |
+
ema_rampup_ratio: '0.05'
|
| 210 |
+
gamma: '16.97'
|
| 211 |
+
start_iter: '0'
|
| 212 |
+
type: constant
|
| 213 |
+
gpu_stats:
|
| 214 |
+
_target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
|
| 215 |
+
every_n: '100'
|
| 216 |
+
grad_clip:
|
| 217 |
+
_target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
|
| 218 |
+
grad_norm: '10.0'
|
| 219 |
+
model_key: net
|
| 220 |
+
param_count:
|
| 221 |
+
_target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
|
| 222 |
+
train_profiler:
|
| 223 |
+
_target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
|
| 224 |
+
every_n: '100'
|
| 225 |
+
wandb:
|
| 226 |
+
_target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
|
| 227 |
+
fps: '25'
|
| 228 |
+
sample_logging_iter: '100'
|
| 229 |
+
checkpointer:
|
| 230 |
+
pretrained_ckpt_key_map:
|
| 231 |
+
net: net
|
| 232 |
+
pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
|
| 233 |
+
s3_container: s3://checkpoints/fastgen
|
| 234 |
+
s3_credential: ./credentials/s3.json
|
| 235 |
+
save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_no_reqgrad_toggle/checkpoints
|
| 236 |
+
use_s3: 'False'
|
| 237 |
+
cudnn:
|
| 238 |
+
benchmark: 'True'
|
| 239 |
+
deterministic: 'False'
|
| 240 |
+
ddp: 'False'
|
| 241 |
+
fsdp: 'True'
|
| 242 |
+
fsdp_cpu_offload: 'False'
|
| 243 |
+
fsdp_min_num_params: '10000000'
|
| 244 |
+
fsdp_sharding_group_size: null
|
| 245 |
+
global_vars: null
|
| 246 |
+
global_vars_val:
|
| 247 |
+
- null
|
| 248 |
+
grad_accum_rounds: '2'
|
| 249 |
+
logging_iter: '1'
|
| 250 |
+
max_iter: '10'
|
| 251 |
+
offload_module_in_decoding: 'False'
|
| 252 |
+
resume: 'False'
|
| 253 |
+
save_ckpt_iter: '100'
|
| 254 |
+
seed: '0'
|
| 255 |
+
skip_initial_validation: 'True'
|
| 256 |
+
tf32_enabled: 'True'
|
| 257 |
+
val_seed: null
|
| 258 |
+
validation_iter: '100'
|
| 259 |
+
visualize_teacher: 'False'
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68cfb6be85ca8bb4cb2c99e580d579759b315332fe04ff9583e8a17503710b70
|
| 3 |
+
size 614328
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__0_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58d137502c435eeaa445c868ec414ada748effb592f340238578f82a40be7a9c
|
| 3 |
+
size 1424446005
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__1_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d60a72b6714edac135314634cae333779c2cdc040ac2bae1b34bfe521981579d
|
| 3 |
+
size 1401881227
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__2_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c33b96f7c9d96d1b61c2817be29f21a1fffbc41eef2ba6d7e4d7d4bb3eb2543
|
| 3 |
+
size 1406759189
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__3_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c16e8089a70fb63a6ef5e83f93b2bc5ef002d78b3e8f1c61ae1fd9386e18db0b
|
| 3 |
+
size 1401210558
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c9e7bf7b483afea9b5f5fc9f6d1a8368eea2246e5e5404c08b9fba8acdc0064
|
| 3 |
+
size 1901
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6aa773eb266887bcf43277c27dddf54d6f73581fb8a49aefd509ff3fd7699d8b
|
| 3 |
+
size 614328
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__0_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f79a6720f3855aca52a768cc9b64766d16fbc6af95c459f3d236fee4056a8b2
|
| 3 |
+
size 1424446005
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__1_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4e4b4d3a27464533ae9b16fe7eb25319df4483379605ce23676ce66ee538e9c
|
| 3 |
+
size 1401881227
|
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__2_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c6fafdbb008d602689cc51bdb7811e31fdab406e70859f6f647970016bfde2a
|
| 3 |
+
size 1406759189
|