paulcho98 commited on
Commit
10153ba
·
verified ·
1 Parent(s): 7fc3985

Add FASTGEN_SF_OUTPUT

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -0
  2. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/config.yaml +259 -0
  3. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug-internal.log +6 -0
  4. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug.log +19 -0
  5. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/output.log +0 -0
  6. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/requirements.txt +222 -0
  7. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/wandb-metadata.json +68 -0
  8. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-core.log +8 -0
  9. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log +6 -0
  10. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log +19 -0
  11. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/run-zmgbhqqw.wandb +3 -0
  12. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb_id.txt +1 -0
  13. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/config.yaml +259 -0
  14. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug-internal.log +6 -0
  15. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug.log +19 -0
  16. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/output.log +739 -0
  17. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/requirements.txt +222 -0
  18. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/wandb-metadata.json +68 -0
  19. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-core.log +8 -0
  20. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log +6 -0
  21. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log +19 -0
  22. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/run-nkf4iovm.wandb +3 -0
  23. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb_id.txt +1 -0
  24. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2/config.yaml +259 -0
  25. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/config.yaml +259 -0
  26. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/wandb_id.txt +1 -0
  27. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/config.yaml +259 -0
  28. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug-internal.log +11 -0
  29. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug.log +21 -0
  30. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/config.yaml +362 -0
  31. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/output.log +310 -0
  32. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/requirements.txt +222 -0
  33. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-metadata.json +69 -0
  34. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-summary.json +1 -0
  35. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-core.log +15 -0
  36. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log +11 -0
  37. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log +21 -0
  38. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/run-spcd04xe.wandb +3 -0
  39. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb_id.txt +1 -0
  40. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_no_reqgrad_toggle/config.yaml +259 -0
  41. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/.metadata +3 -0
  42. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__0_0.distcp +3 -0
  43. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__1_0.distcp +3 -0
  44. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__2_0.distcp +3 -0
  45. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__3_0.distcp +3 -0
  46. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.pth +3 -0
  47. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/.metadata +3 -0
  48. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__0_0.distcp +3 -0
  49. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__1_0.distcp +3 -0
  50. FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__2_0.distcp +3 -0
.gitattributes CHANGED
The diff for this file is too large to render. See raw diff
 
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_train:
2
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
3
+ batch_size: '8'
4
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
5
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
6
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
7
+ num_workers: '2'
8
+ use_ref_sequence: 'True'
9
+ dataloader_val:
10
+ _target_: <function create_omniavatar_dataloader at 0x7fa0fef8fb00>
11
+ batch_size: '1'
12
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
13
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
14
+ load_ode_path: 'False'
15
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
16
+ num_workers: '2'
17
+ use_ref_sequence: 'True'
18
+ eval:
19
+ max_ckpt: '100000000'
20
+ min_ckpt: '0'
21
+ num_samples: '50000'
22
+ samples_dir: samples
23
+ save_images: 'False'
24
+ log_config:
25
+ group: omniavatar_sf
26
+ name: sf_4gpu_bs8_5000iter_shift5
27
+ project: OmniAvatar-FastGen
28
+ wandb_credential: ./credentials/wandb_api.txt
29
+ wandb_entity: paulhcho
30
+ wandb_mode: online
31
+ model:
32
+ add_teacher_to_fsdp_dict: 'True'
33
+ context_noise: '0.0'
34
+ ddp_find_unused_parameters: 'True'
35
+ device: cuda
36
+ discriminator:
37
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
38
+ all_res:
39
+ - '32'
40
+ - '16'
41
+ - '8'
42
+ feature_indices: '{0, 1, 2}'
43
+ in_channels: '256'
44
+ discriminator_optimizer:
45
+ _target_: <function get_optimizer at 0x7fa0fefee660>
46
+ betas:
47
+ - '0.9'
48
+ - '0.999'
49
+ eps: 1e-08
50
+ fused: 'False'
51
+ lr: '0.0001'
52
+ model: null
53
+ optim_type: adamw
54
+ weight_decay: '0.01'
55
+ discriminator_scheduler:
56
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
57
+ cycle_lengths:
58
+ - '10000000000'
59
+ f_max:
60
+ - '1.0'
61
+ f_min:
62
+ - '1.0'
63
+ f_start:
64
+ - 1e-06
65
+ warm_up_steps:
66
+ - '0'
67
+ enable_gradient_in_rollout: 'True'
68
+ enable_preprocessors: 'True'
69
+ fake_score: null
70
+ fake_score_net:
71
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
72
+ audio_hidden_size: '32'
73
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
74
+ in_dim: '65'
75
+ merge_lora: 'False'
76
+ mode: v2v
77
+ model_size: 1.3B
78
+ net_pred_type: flow
79
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
80
+ schedule_type: rf
81
+ use_audio: 'True'
82
+ fake_score_optimizer:
83
+ _target_: <function get_optimizer at 0x7fa0fefee660>
84
+ betas:
85
+ - '0.0'
86
+ - '0.999'
87
+ eps: 1e-08
88
+ fused: 'False'
89
+ lr: 2e-06
90
+ model: null
91
+ optim_type: adamw
92
+ weight_decay: '0.01'
93
+ fake_score_pred_type: x0
94
+ fake_score_scheduler:
95
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
96
+ cycle_lengths:
97
+ - '10000000000'
98
+ f_max:
99
+ - '1.0'
100
+ f_min:
101
+ - '1.0'
102
+ f_start:
103
+ - 1e-06
104
+ warm_up_steps:
105
+ - '0'
106
+ fsdp_meta_init: 'False'
107
+ gan_loss_weight_gen: '0'
108
+ gan_r1_reg_alpha: '0.1'
109
+ gan_r1_reg_weight: '0.0'
110
+ gan_use_same_t_noise: 'False'
111
+ grad_scaler_enabled: 'False'
112
+ grad_scaler_growth_interval: '2000'
113
+ grad_scaler_init_scale: '65536.0'
114
+ guidance_scale: '4.5'
115
+ input_shape:
116
+ - '16'
117
+ - '21'
118
+ - '64'
119
+ - '64'
120
+ last_step_only: 'False'
121
+ load_student_weights: 'False'
122
+ net:
123
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
124
+ audio_hidden_size: '32'
125
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
126
+ chunk_size: '3'
127
+ in_dim: '65'
128
+ mode: v2v
129
+ model_size: 1.3B
130
+ net_pred_type: flow
131
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
132
+ schedule_type: rf
133
+ total_num_frames: '21'
134
+ use_audio: 'True'
135
+ net_optimizer:
136
+ _target_: <function get_optimizer at 0x7fa0fefee660>
137
+ betas:
138
+ - '0.0'
139
+ - '0.999'
140
+ eps: 1e-08
141
+ fused: 'False'
142
+ lr: 2e-06
143
+ model: null
144
+ optim_type: adamw
145
+ weight_decay: '0.01'
146
+ net_scheduler:
147
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
148
+ cycle_lengths:
149
+ - '10000000000'
150
+ f_max:
151
+ - '1.0'
152
+ f_min:
153
+ - '1.0'
154
+ f_start:
155
+ - 1e-06
156
+ warm_up_steps:
157
+ - '0'
158
+ precision: bfloat16
159
+ precision_amp: null
160
+ precision_amp_enc: null
161
+ precision_amp_infer: null
162
+ precision_fsdp: bfloat16
163
+ pretrained_model_path: ''
164
+ pretrained_student_net_path: ''
165
+ same_step_across_blocks: 'True'
166
+ sample_t_cfg:
167
+ log_t_df: '0.01'
168
+ max_t: '0.999'
169
+ min_t: '0.001'
170
+ shift: '5.0'
171
+ t_list:
172
+ - '0.999'
173
+ - '0.937'
174
+ - '0.833'
175
+ - '0.624'
176
+ - '0.0'
177
+ time_dist_type: shifted
178
+ train_p_mean: '-1.1'
179
+ train_p_std: '2.0'
180
+ skip_layers: null
181
+ start_gradient_frame: '0'
182
+ student_sample_steps: '4'
183
+ student_sample_type: sde
184
+ student_update_freq: '5'
185
+ teacher:
186
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
187
+ audio_hidden_size: '32'
188
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
189
+ in_dim: '65'
190
+ merge_lora: 'True'
191
+ mode: v2v
192
+ model_size: 14B
193
+ net_pred_type: flow
194
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
195
+ schedule_type: rf
196
+ use_audio: 'True'
197
+ use_ema: 'False'
198
+ model_class:
199
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
200
+ config: null
201
+ trainer:
202
+ augment_pipe: null
203
+ batch_size_global: null
204
+ callbacks:
205
+ ema:
206
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
207
+ beta: '0.9999'
208
+ ema_halflife_kimg: '500'
209
+ ema_rampup_ratio: '0.05'
210
+ gamma: '16.97'
211
+ start_iter: '0'
212
+ type: constant
213
+ gpu_stats:
214
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
215
+ every_n: '100'
216
+ grad_clip:
217
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
218
+ grad_norm: '10.0'
219
+ model_key: net
220
+ param_count:
221
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
222
+ train_profiler:
223
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
224
+ every_n: '100'
225
+ wandb:
226
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
227
+ fps: '25'
228
+ sample_logging_iter: '100'
229
+ checkpointer:
230
+ pretrained_ckpt_key_map:
231
+ net: net
232
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
233
+ s3_container: s3://checkpoints/fastgen
234
+ s3_credential: ./credentials/s3.json
235
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/checkpoints
236
+ use_s3: 'False'
237
+ cudnn:
238
+ benchmark: 'True'
239
+ deterministic: 'False'
240
+ ddp: 'False'
241
+ fsdp: 'True'
242
+ fsdp_cpu_offload: 'False'
243
+ fsdp_min_num_params: '10000000'
244
+ fsdp_sharding_group_size: null
245
+ global_vars: null
246
+ global_vars_val:
247
+ - null
248
+ grad_accum_rounds: '2'
249
+ logging_iter: '1'
250
+ max_iter: '5000'
251
+ offload_module_in_decoding: 'False'
252
+ resume: 'False'
253
+ save_ckpt_iter: '100'
254
+ seed: '0'
255
+ skip_initial_validation: 'True'
256
+ tf32_enabled: 'True'
257
+ val_seed: null
258
+ validation_iter: '100'
259
+ visualize_teacher: 'False'
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-04-02T20:58:42.181919016+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-04-02T20:58:42.574143607+09:00","level":"INFO","msg":"stream: created new stream","id":"zmgbhqqw"}
3
+ {"time":"2026-04-02T20:58:42.574192032+09:00","level":"INFO","msg":"handler: started","stream_id":"zmgbhqqw"}
4
+ {"time":"2026-04-02T20:58:42.574261225+09:00","level":"INFO","msg":"stream: started","id":"zmgbhqqw"}
5
+ {"time":"2026-04-02T20:58:42.574281395+09:00","level":"INFO","msg":"writer: started","stream_id":"zmgbhqqw"}
6
+ {"time":"2026-04-02T20:58:42.57428521+09:00","level":"INFO","msg":"sender: started","stream_id":"zmgbhqqw"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/debug.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Configure stats pid to 549927
3
+ 2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log
5
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log
6
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_5000iter_shift5', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7fbeacc8fb00>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
9
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():892] starting backend
10
+ 2026-04-02 20:58:42,167 INFO MainThread:549927 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-02 20:58:42,179 INFO MainThread:549927 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-02 20:58:42,183 INFO MainThread:549927 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-02 20:58:42,199 INFO MainThread:549927 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-02 20:58:43,281 INFO MainThread:549927 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-04-02 20:58:43,513 INFO MainThread:549927 [wandb_init.py:init():1082] run started, returning control to user process
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/requirements.txt ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastgen==0.1.0
2
+ nvitop==1.6.1
3
+ ftfy==6.3.1
4
+ braceexpand==0.1.7
5
+ antlr4-python3-runtime==4.9.3
6
+ webdataset==1.0.2
7
+ sentry-sdk==2.53.0
8
+ rdkit==2025.9.5
9
+ python-dotenv==1.2.1
10
+ proglog==0.1.12
11
+ omegaconf==2.3.0
12
+ narwhals==2.17.0
13
+ loguru==0.7.3
14
+ imageio-ffmpeg==0.6.0
15
+ plotly==6.5.2
16
+ moviepy==2.2.1
17
+ hydra-core==1.3.2
18
+ wandb==0.25.0
19
+ fastgen==0.1.0
20
+ packaging==25.0
21
+ setuptools==80.10.2
22
+ wheel==0.46.3
23
+ pip==26.0.1
24
+ webencodings==0.5.1
25
+ pure_eval==0.2.3
26
+ ptyprocess==0.7.0
27
+ nvidia-ml-py==13.590.48
28
+ nvidia-cusparselt-cu12==0.7.1
29
+ mpmath==1.3.0
30
+ fastjsonschema==2.21.2
31
+ zipp==3.23.0
32
+ xyzservices==2025.11.0
33
+ widgetsnbextension==4.0.15
34
+ websocket-client==1.9.0
35
+ webcolors==25.10.0
36
+ wcwidth==0.6.0
37
+ urllib3==2.6.3
38
+ uri-template==1.3.0
39
+ tzdata==2025.3
40
+ typing_extensions==4.15.0
41
+ triton==3.6.0
42
+ traitlets==5.14.3
43
+ tqdm==4.67.3
44
+ tornado==6.5.5
45
+ tinycss2==1.4.0
46
+ sympy==1.14.0
47
+ soupsieve==2.8.3
48
+ smmap==5.0.3
49
+ six==1.16.0
50
+ sentencepiece==0.2.1
51
+ Send2Trash==2.1.0
52
+ safetensors==0.7.0
53
+ rpds-py==0.30.0
54
+ rfc3986-validator==0.1.1
55
+ regex==2026.2.28
56
+ pyzmq==27.1.0
57
+ PyYAML==6.0.3
58
+ python-json-logger==4.0.0
59
+ Pygments==2.19.2
60
+ pycparser==3.0
61
+ psutil==7.2.2
62
+ protobuf==4.24.4
63
+ prometheus_client==0.24.1
64
+ platformdirs==4.9.4
65
+ pillow==11.3.0
66
+ pexpect==4.9.0
67
+ parso==0.8.6
68
+ pandocfilters==1.5.1
69
+ nvidia-nvtx-cu12==12.8.90
70
+ nvidia-nvshmem-cu12==3.4.5
71
+ nvidia-nvjitlink-cu12==12.8.93
72
+ nvidia-nccl-cu12==2.27.5
73
+ nvidia-curand-cu12==10.3.9.90
74
+ nvidia-cufile-cu12==1.13.1.3
75
+ nvidia-cuda-runtime-cu12==12.8.90
76
+ nvidia-cuda-nvrtc-cu12==12.8.93
77
+ nvidia-cuda-cupti-cu12==12.8.90
78
+ nvidia-cublas-cu12==12.8.4.1
79
+ numpy==1.26.4
80
+ networkx==3.6.1
81
+ nest-asyncio==1.6.0
82
+ mistune==3.2.0
83
+ MarkupSafe==3.0.3
84
+ lark==1.3.1
85
+ jupyterlab_widgets==3.0.16
86
+ jupyterlab_pygments==0.3.0
87
+ jsonpointer==3.0.0
88
+ json5==0.13.0
89
+ jmespath==1.1.0
90
+ idna==3.11
91
+ hf-xet==1.4.2
92
+ h11==0.16.0
93
+ fsspec==2026.2.0
94
+ fqdn==1.5.1
95
+ filelock==3.25.2
96
+ executing==2.2.1
97
+ einops==0.8.2
98
+ defusedxml==0.7.1
99
+ decorator==5.2.1
100
+ debugpy==1.8.20
101
+ cuda-pathfinder==1.4.2
102
+ comm==0.2.3
103
+ click==8.3.1
104
+ charset-normalizer==3.4.5
105
+ certifi==2026.2.25
106
+ bleach==6.3.0
107
+ babel==2.18.0
108
+ av==17.0.0
109
+ attrs==25.4.0
110
+ async-lru==2.2.0
111
+ asttokens==3.0.1
112
+ annotated-types==0.7.0
113
+ typing-inspection==0.4.2
114
+ terminado==0.18.1
115
+ stack-data==0.6.3
116
+ scipy==1.17.1
117
+ rfc3987-syntax==1.1.0
118
+ rfc3339-validator==0.1.4
119
+ requests==2.32.5
120
+ referencing==0.37.0
121
+ python-dateutil==2.9.0.post0
122
+ pydantic_core==2.41.5
123
+ prompt_toolkit==3.0.52
124
+ opencv-python-headless==4.11.0.86
125
+ nvidia-cusparse-cu12==12.5.8.93
126
+ nvidia-cufft-cu12==11.3.3.83
127
+ nvidia-cudnn-cu12==9.10.2.21
128
+ matplotlib-inline==0.2.1
129
+ jupyter_core==5.9.1
130
+ Jinja2==3.1.6
131
+ jedi==0.19.2
132
+ ipython_pygments_lexers==1.1.1
133
+ importlib_metadata==8.7.1
134
+ ImageIO==2.37.3
135
+ httpcore==1.0.9
136
+ gitdb==4.0.12
137
+ cuda-bindings==12.9.4
138
+ contourpy==1.3.3
139
+ cffi==2.0.0
140
+ beautifulsoup4==4.14.3
141
+ anyio==4.12.1
142
+ soundfile==0.13.1
143
+ pydantic==2.12.5
144
+ nvidia-cusolver-cu12==11.7.3.90
145
+ jupyter_server_terminals==0.5.4
146
+ jupyter_client==8.8.0
147
+ jsonschema-specifications==2025.9.1
148
+ ipython==9.11.0
149
+ httpx==0.28.1
150
+ GitPython==3.1.46
151
+ botocore==1.42.68
152
+ bokeh==3.9.0
153
+ arrow==1.4.0
154
+ argon2-cffi-bindings==25.1.0
155
+ torch==2.10.0
156
+ s3transfer==0.16.0
157
+ jsonschema==4.26.0
158
+ isoduration==20.11.0
159
+ ipywidgets==8.1.8
160
+ ipykernel==7.2.0
161
+ argon2-cffi==25.1.0
162
+ torchvision==0.25.0
163
+ nbformat==5.10.4
164
+ jupyter-console==6.6.3
165
+ boto3==1.42.68
166
+ accelerate==1.13.0
167
+ nbclient==0.10.4
168
+ jupyter-events==0.12.0
169
+ nbconvert==7.17.0
170
+ jupyter_server==2.17.0
171
+ notebook_shim==0.2.4
172
+ jupyterlab_server==2.28.0
173
+ jupyter-lsp==2.3.0
174
+ jupyterlab==4.5.6
175
+ notebook==7.5.5
176
+ jupyter==1.1.1
177
+ fastgen==0.1.0
178
+ pandas==3.0.1
179
+ shellingham==1.5.4
180
+ mdurl==0.1.2
181
+ annotated-doc==0.0.4
182
+ markdown-it-py==4.0.0
183
+ rich==14.3.3
184
+ typer==0.24.1
185
+ huggingface_hub==1.7.1
186
+ timm==1.0.25
187
+ tokenizers==0.22.2
188
+ diffusers==0.37.0
189
+ transformers==5.3.0
190
+ peft==0.18.1
191
+ easydict==1.13
192
+ lmdb==2.2.0
193
+ threadpoolctl==3.6.0
194
+ soxr==1.0.0
195
+ msgpack==1.1.2
196
+ llvmlite==0.47.0
197
+ lazy-loader==0.5
198
+ joblib==1.5.3
199
+ audioread==3.1.0
200
+ scikit-learn==1.8.0
201
+ pooch==1.9.0
202
+ numba==0.65.0
203
+ librosa==0.11.0
204
+ simsimd==6.5.16
205
+ flatbuffers==25.12.19
206
+ tifffile==2026.3.3
207
+ stringzilla==4.6.0
208
+ pyparsing==3.3.2
209
+ prettytable==3.17.0
210
+ onnx==1.17.0
211
+ kiwisolver==1.5.0
212
+ fonttools==4.62.1
213
+ Cython==3.2.4
214
+ cycler==0.12.1
215
+ scikit-image==0.26.0
216
+ onnxruntime==1.24.4
217
+ matplotlib==3.10.8
218
+ albucore==0.0.24
219
+ albumentations==2.0.8
220
+ insightface==0.7.3
221
+ kornia_rs==0.1.10
222
+ kornia==0.8.2
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/files/wandb-metadata.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-151-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.12",
4
+ "startedAt": "2026-04-02T11:58:41.929718Z",
5
+ "args": [
6
+ "--config=fastgen/configs/experiments/OmniAvatar/config_sf.py",
7
+ "-",
8
+ "trainer.resume=False",
9
+ "log_config.name=sf_4gpu_bs8_5000iter_shift5",
10
+ "log_config.project=OmniAvatar-FastGen"
11
+ ],
12
+ "program": "/home/work/.local/hyunbin/FastGen/train.py",
13
+ "codePath": "train.py",
14
+ "codePathLocal": "train.py",
15
+ "git": {
16
+ "remote": "https://paulcho98:@github.com/paulcho98/FastGen.git",
17
+ "commit": "dead092792003faa07babff77ccd223af4ad9b11"
18
+ },
19
+ "email": "paul.hyunbin@gmail.com",
20
+ "root": "/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5",
21
+ "host": "main1",
22
+ "executable": "/home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12",
23
+ "cpu_count": 112,
24
+ "cpu_count_logical": 224,
25
+ "gpu": "NVIDIA H200",
26
+ "gpu_count": 4,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1356758433792",
30
+ "used": "257961558016"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "2163961778176"
35
+ },
36
+ "gpu_nvidia": [
37
+ {
38
+ "name": "NVIDIA H200",
39
+ "memoryTotal": "150754820096",
40
+ "cudaCores": 16896,
41
+ "architecture": "Hopper",
42
+ "uuid": "GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c"
43
+ },
44
+ {
45
+ "name": "NVIDIA H200",
46
+ "memoryTotal": "150754820096",
47
+ "cudaCores": 16896,
48
+ "architecture": "Hopper",
49
+ "uuid": "GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262"
50
+ },
51
+ {
52
+ "name": "NVIDIA H200",
53
+ "memoryTotal": "150754820096",
54
+ "cudaCores": 16896,
55
+ "architecture": "Hopper",
56
+ "uuid": "GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790"
57
+ },
58
+ {
59
+ "name": "NVIDIA H200",
60
+ "memoryTotal": "150754820096",
61
+ "cudaCores": 16896,
62
+ "architecture": "Hopper",
63
+ "uuid": "GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0"
64
+ }
65
+ ],
66
+ "cudaVersion": "12.9",
67
+ "writerId": "n2ybi81tgd0arslahhy2n7g532wc0pja"
68
+ }
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-core.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-02T20:58:41.981092557+09:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpwwrvdk3u/port-549927.txt","pid":549927,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-02T20:58:41.98157225+09:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":549927}
3
+ {"time":"2026-04-02T20:58:41.981562902+09:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-549927-558790-3160372452/socket","Net":"unix"}}
4
+ {"time":"2026-04-02T20:58:42.16745332+09:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-02T20:58:42.181823375+09:00","level":"INFO","msg":"handleInformInit: received","streamId":"zmgbhqqw","id":"1(@)"}
6
+ {"time":"2026-04-02T20:58:42.574268009+09:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"zmgbhqqw","id":"1(@)"}
7
+ {"time":"2026-04-02T20:58:48.512451301+09:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"6lpqhxkfa0yx"}
8
+ {"time":"2026-04-02T21:41:09.853455246+09:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-04-02T20:58:42.181919016+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-04-02T20:58:42.574143607+09:00","level":"INFO","msg":"stream: created new stream","id":"zmgbhqqw"}
3
+ {"time":"2026-04-02T20:58:42.574192032+09:00","level":"INFO","msg":"handler: started","stream_id":"zmgbhqqw"}
4
+ {"time":"2026-04-02T20:58:42.574261225+09:00","level":"INFO","msg":"stream: started","id":"zmgbhqqw"}
5
+ {"time":"2026-04-02T20:58:42.574281395+09:00","level":"INFO","msg":"writer: started","stream_id":"zmgbhqqw"}
6
+ {"time":"2026-04-02T20:58:42.57428521+09:00","level":"INFO","msg":"sender: started","stream_id":"zmgbhqqw"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Configure stats pid to 549927
3
+ 2026-04-02 20:58:41,930 INFO MainThread:549927 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug.log
5
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/logs/debug-internal.log
6
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_5000iter_shift5', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7fbeacc8fb00>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7fbeacee6660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
9
+ 2026-04-02 20:58:41,931 INFO MainThread:549927 [wandb_init.py:init():892] starting backend
10
+ 2026-04-02 20:58:42,167 INFO MainThread:549927 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-02 20:58:42,179 INFO MainThread:549927 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-02 20:58:42,183 INFO MainThread:549927 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-02 20:58:42,199 INFO MainThread:549927 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-02 20:58:43,281 INFO MainThread:549927 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-04-02 20:58:43,509 INFO MainThread:549927 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-04-02 20:58:43,513 INFO MainThread:549927 [wandb_init.py:init():1082] run started, returning control to user process
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb/run-20260402_205841-zmgbhqqw/run-zmgbhqqw.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7d6105253ff0592490ce0e6460f37480df990ab3d55586489772eaeeb75f982
3
+ size 688128
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_5000iter_shift5/wandb_id.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ zmgbhqqw
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_train:
2
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
3
+ batch_size: '8'
4
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
5
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
6
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
7
+ num_workers: '2'
8
+ use_ref_sequence: 'True'
9
+ dataloader_val:
10
+ _target_: <function create_omniavatar_dataloader at 0x7fc7a89d7ce0>
11
+ batch_size: '1'
12
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
13
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
14
+ load_ode_path: 'False'
15
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
16
+ num_workers: '2'
17
+ use_ref_sequence: 'True'
18
+ eval:
19
+ max_ckpt: '100000000'
20
+ min_ckpt: '0'
21
+ num_samples: '50000'
22
+ samples_dir: samples
23
+ save_images: 'False'
24
+ log_config:
25
+ group: omniavatar_sf
26
+ name: sf_4gpu_bs8_lr2e6_5000iter_shift5_combined
27
+ project: OmniAvatar-FastGen
28
+ wandb_credential: ./credentials/wandb_api.txt
29
+ wandb_entity: paulhcho
30
+ wandb_mode: online
31
+ model:
32
+ add_teacher_to_fsdp_dict: 'True'
33
+ context_noise: '0.0'
34
+ ddp_find_unused_parameters: 'True'
35
+ device: cuda
36
+ discriminator:
37
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
38
+ all_res:
39
+ - '32'
40
+ - '16'
41
+ - '8'
42
+ feature_indices: '{0, 1, 2}'
43
+ in_channels: '256'
44
+ discriminator_optimizer:
45
+ _target_: <function get_optimizer at 0x7fc7a8c2e660>
46
+ betas:
47
+ - '0.9'
48
+ - '0.999'
49
+ eps: 1e-08
50
+ fused: 'False'
51
+ lr: '0.0001'
52
+ model: null
53
+ optim_type: adamw
54
+ weight_decay: '0.01'
55
+ discriminator_scheduler:
56
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
57
+ cycle_lengths:
58
+ - '10000000000'
59
+ f_max:
60
+ - '1.0'
61
+ f_min:
62
+ - '1.0'
63
+ f_start:
64
+ - 1e-06
65
+ warm_up_steps:
66
+ - '0'
67
+ enable_gradient_in_rollout: 'True'
68
+ enable_preprocessors: 'True'
69
+ fake_score: null
70
+ fake_score_net:
71
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
72
+ audio_hidden_size: '32'
73
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
74
+ in_dim: '65'
75
+ merge_lora: 'False'
76
+ mode: v2v
77
+ model_size: 1.3B
78
+ net_pred_type: flow
79
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
80
+ schedule_type: rf
81
+ use_audio: 'True'
82
+ fake_score_optimizer:
83
+ _target_: <function get_optimizer at 0x7fc7a8c2e660>
84
+ betas:
85
+ - '0.0'
86
+ - '0.999'
87
+ eps: 1e-08
88
+ fused: 'False'
89
+ lr: 2e-06
90
+ model: null
91
+ optim_type: adamw
92
+ weight_decay: '0.01'
93
+ fake_score_pred_type: x0
94
+ fake_score_scheduler:
95
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
96
+ cycle_lengths:
97
+ - '10000000000'
98
+ f_max:
99
+ - '1.0'
100
+ f_min:
101
+ - '1.0'
102
+ f_start:
103
+ - 1e-06
104
+ warm_up_steps:
105
+ - '0'
106
+ fsdp_meta_init: 'False'
107
+ gan_loss_weight_gen: '0'
108
+ gan_r1_reg_alpha: '0.1'
109
+ gan_r1_reg_weight: '0.0'
110
+ gan_use_same_t_noise: 'False'
111
+ grad_scaler_enabled: 'False'
112
+ grad_scaler_growth_interval: '2000'
113
+ grad_scaler_init_scale: '65536.0'
114
+ guidance_scale: '4.5'
115
+ input_shape:
116
+ - '16'
117
+ - '21'
118
+ - '64'
119
+ - '64'
120
+ last_step_only: 'False'
121
+ load_student_weights: 'False'
122
+ net:
123
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
124
+ audio_hidden_size: '32'
125
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
126
+ chunk_size: '3'
127
+ in_dim: '65'
128
+ mode: v2v
129
+ model_size: 1.3B
130
+ net_pred_type: flow
131
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
132
+ schedule_type: rf
133
+ total_num_frames: '21'
134
+ use_audio: 'True'
135
+ net_optimizer:
136
+ _target_: <function get_optimizer at 0x7fc7a8c2e660>
137
+ betas:
138
+ - '0.0'
139
+ - '0.999'
140
+ eps: 1e-08
141
+ fused: 'False'
142
+ lr: 2e-06
143
+ model: null
144
+ optim_type: adamw
145
+ weight_decay: '0.01'
146
+ net_scheduler:
147
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
148
+ cycle_lengths:
149
+ - '10000000000'
150
+ f_max:
151
+ - '1.0'
152
+ f_min:
153
+ - '1.0'
154
+ f_start:
155
+ - 1e-06
156
+ warm_up_steps:
157
+ - '0'
158
+ precision: bfloat16
159
+ precision_amp: null
160
+ precision_amp_enc: null
161
+ precision_amp_infer: null
162
+ precision_fsdp: bfloat16
163
+ pretrained_model_path: ''
164
+ pretrained_student_net_path: ''
165
+ same_step_across_blocks: 'True'
166
+ sample_t_cfg:
167
+ log_t_df: '0.01'
168
+ max_t: '0.999'
169
+ min_t: '0.001'
170
+ shift: '5.0'
171
+ t_list:
172
+ - '0.999'
173
+ - '0.937'
174
+ - '0.833'
175
+ - '0.624'
176
+ - '0.0'
177
+ time_dist_type: shifted
178
+ train_p_mean: '-1.1'
179
+ train_p_std: '2.0'
180
+ skip_layers: null
181
+ start_gradient_frame: '0'
182
+ student_sample_steps: '4'
183
+ student_sample_type: sde
184
+ student_update_freq: '5'
185
+ teacher:
186
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
187
+ audio_hidden_size: '32'
188
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
189
+ in_dim: '65'
190
+ merge_lora: 'True'
191
+ mode: v2v
192
+ model_size: 14B
193
+ net_pred_type: flow
194
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
195
+ schedule_type: rf
196
+ use_audio: 'True'
197
+ use_ema: 'False'
198
+ model_class:
199
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
200
+ config: null
201
+ trainer:
202
+ augment_pipe: null
203
+ batch_size_global: null
204
+ callbacks:
205
+ ema:
206
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
207
+ beta: '0.9999'
208
+ ema_halflife_kimg: '500'
209
+ ema_rampup_ratio: '0.05'
210
+ gamma: '16.97'
211
+ start_iter: '0'
212
+ type: constant
213
+ gpu_stats:
214
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
215
+ every_n: '100'
216
+ grad_clip:
217
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
218
+ grad_norm: '10.0'
219
+ model_key: net
220
+ param_count:
221
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
222
+ train_profiler:
223
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
224
+ every_n: '100'
225
+ wandb:
226
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
227
+ fps: '25'
228
+ sample_logging_iter: '100'
229
+ checkpointer:
230
+ pretrained_ckpt_key_map:
231
+ net: net
232
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
233
+ s3_container: s3://checkpoints/fastgen
234
+ s3_credential: ./credentials/s3.json
235
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/checkpoints
236
+ use_s3: 'False'
237
+ cudnn:
238
+ benchmark: 'True'
239
+ deterministic: 'False'
240
+ ddp: 'False'
241
+ fsdp: 'True'
242
+ fsdp_cpu_offload: 'False'
243
+ fsdp_min_num_params: '10000000'
244
+ fsdp_sharding_group_size: null
245
+ global_vars: null
246
+ global_vars_val:
247
+ - null
248
+ grad_accum_rounds: '2'
249
+ logging_iter: '1'
250
+ max_iter: '5000'
251
+ offload_module_in_decoding: 'False'
252
+ resume: 'False'
253
+ save_ckpt_iter: '100'
254
+ seed: '0'
255
+ skip_initial_validation: 'True'
256
+ tf32_enabled: 'True'
257
+ val_seed: null
258
+ validation_iter: '100'
259
+ visualize_teacher: 'False'
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-04-03T00:01:32.430639522+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-04-03T00:01:32.800089951+09:00","level":"INFO","msg":"stream: created new stream","id":"nkf4iovm"}
3
+ {"time":"2026-04-03T00:01:32.800139938+09:00","level":"INFO","msg":"handler: started","stream_id":"nkf4iovm"}
4
+ {"time":"2026-04-03T00:01:32.800233729+09:00","level":"INFO","msg":"stream: started","id":"nkf4iovm"}
5
+ {"time":"2026-04-03T00:01:32.80025365+09:00","level":"INFO","msg":"sender: started","stream_id":"nkf4iovm"}
6
+ {"time":"2026-04-03T00:01:32.800252986+09:00","level":"INFO","msg":"writer: started","stream_id":"nkf4iovm"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/debug.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Configure stats pid to 1047116
3
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log
5
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log
6
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_lr2e6_5000iter_shift5_combined', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7f328019bce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
9
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():892] starting backend
10
+ 2026-04-03 00:01:32,416 INFO MainThread:1047116 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-03 00:01:32,428 INFO MainThread:1047116 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-03 00:01:32,432 INFO MainThread:1047116 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-03 00:01:32,448 INFO MainThread:1047116 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-03 00:01:33,668 INFO MainThread:1047116 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-04-03 00:01:33,986 INFO MainThread:1047116 [wandb_init.py:init():1082] run started, returning control to user process
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/output.log ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Apr 3, 2026 - 00:01:33 | INFO | fastgen.callbacks.wandb:init_wandb:259 ] Wandb code upload disabled (set WANDB_UPLOAD_CODE=true to enable)
2
+ [Apr 3, 2026 - 00:01:33 | SUCCESS | fastgen.trainer:__init__:53 ] Callbacks initialized successfully
3
+ [Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:__init__:57 ] Callback synchronization complete
4
+ [Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:__init__:60 ] Initializing checkpointer...
5
+ [Apr 3, 2026 - 00:01:33 | SUCCESS | fastgen.trainer:__init__:65 ] Checkpointer initialized successfully
6
+ [Apr 3, 2026 - 00:01:33 | SUCCESS | __main__:main:33 ] Trainer initialized successfully
7
+ [Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:run:77 ] Starting training
8
+ [Apr 3, 2026 - 00:01:33 | INFO | fastgen.trainer:run:80 ] Initializing callbacks and model ...
9
+ [Apr 3, 2026 - 00:01:33 | INFO | fastgen.utils.checkpointer:load:151 ] Loading model from /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
10
+ [Apr 3, 2026 - 00:01:38 | INFO | fastgen.utils.checkpointer:load:154 ] Loading the model_dict...
11
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.checkpointer:load:159 ] Model net, loading info: <All keys matched successfully>
12
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:load_pretrained_ckpt:252 ] Loaded net model from net in /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth at iteration 5000
13
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:load_pretrained_ckpt:262 ] Setting resume_iter for model to 5000.
14
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:run:95 ] Starting model.on_train_begin ...
15
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.methods.model:on_train_begin:296 ] Teacher check: add_teacher_to_fsdp_dict=True, fsdp_dict keys=['net', 'fake_score', 'teacher'], teacher in fsdp_dict=True
16
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:run:99 ] model.on_train_begin completed
17
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.trainer:run:110 ] Wrapping model into fsdp ..
18
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:102 ] Fully sharding model with 4 ranks...
19
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'net' (1.42B params)...
20
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.networks.OmniAvatar.network_causal:fully_shard:1950 ] CausalOmniAvatarWan: keeping manual gradient checkpointing (not using apply_fsdp_checkpointing due to KV cache dynamics)
21
+ [Apr 3, 2026 - 00:01:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
22
+ [Apr 3, 2026 - 00:01:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped net in 1.1s
23
+ [Apr 3, 2026 - 00:01:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'fake_score' (1.60B params)...
24
+ [Apr 3, 2026 - 00:01:51 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
25
+ [Apr 3, 2026 - 00:01:52 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
26
+ [Apr 3, 2026 - 00:01:52 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped fake_score in 1.5s
27
+ [Apr 3, 2026 - 00:01:52 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'teacher' (14.29B params)...
28
+ [Apr 3, 2026 - 00:01:52 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
29
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
30
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped teacher in 10.9s
31
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.trainer:run:118 ] FSDP wrapping completed
32
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.callbacks.ema:on_model_init_end:64 ] EMA ema is not enabled, skipping callback.
33
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.trainer:run:133 ] Auto-Resume Details: None
34
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.utils.basic_utils:set_random_seed:144 ] Using random seed 0.
35
+ [Apr 3, 2026 - 00:02:03 | INFO | fastgen.trainer:run:165 ] Instantiating dataloader...
36
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.wandb:on_dataloader_init_end:361 ] SKIP_GT_VAL_UPLOAD=1 — skipping GT val video upload
37
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.gpu_stats:on_train_begin:57 ] every_n to measure gpus stats: 1
38
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.train_profiler:on_train_begin:54 ] every_n to profile trainer: 1
39
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] model (OmniAvatarSelfForcingModel) has 1596.36 M trainable and 17311.83 M total params (logical).
40
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.42 M trainable and 4350.43 M total params LOCAL on rank 0.
41
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 1.
42
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 2.
43
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 3.
44
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] net (CausalOmniAvatarWan) has 1421.38 M trainable and 1421.38 M total params (logical).
45
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.68 M trainable and 376.68 M total params LOCAL on rank 0.
46
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 1.
47
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 2.
48
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 3.
49
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] fake_score (OmniAvatarWan) has 174.98 M trainable and 1596.36 M total params (logical).
50
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.30 M total params LOCAL on rank 0.
51
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 1.
52
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 2.
53
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 3.
54
+ [Apr 3, 2026 - 00:02:09 | INFO | fastgen.trainer:run:174 ] iter_start: 0
55
+ [MEM] fake_score_update: START: alloc=9.45GB reserved=9.88GB peak=9.60GB
56
+ [MEM] fake_score_update: after student gen (no_grad): alloc=12.28GB reserved=49.39GB peak=45.74GB
57
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=13.16GB peak=45.74GB
58
+ [MEM-fwd] after block 0: alloc=13.74GB peak=45.74GB
59
+ [MEM-fwd] after block 10: alloc=19.55GB peak=45.74GB
60
+ [MEM-fwd] after block 20: alloc=24.84GB peak=45.74GB
61
+ [MEM-fwd] after block 29: alloc=29.59GB peak=45.74GB
62
+ [MEM-fwd] after head+unpatchify: alloc=30.67GB peak=45.74GB
63
+ [MEM] fake_score_update: START: alloc=13.25GB reserved=15.24GB peak=54.53GB
64
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.29GB reserved=47.38GB peak=46.75GB
65
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.17GB peak=46.75GB
66
+ [MEM-fwd] after block 0: alloc=14.75GB peak=46.75GB
67
+ [MEM-fwd] after block 10: alloc=20.56GB peak=46.75GB
68
+ [MEM-fwd] after block 20: alloc=25.85GB peak=46.75GB
69
+ [MEM-fwd] after block 29: alloc=30.61GB peak=46.75GB
70
+ [MEM-fwd] after head+unpatchify: alloc=31.69GB peak=46.75GB
71
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
72
+ Avg Max Min
73
+ cpu_mem_gb 38.485269 38.565594 38.411797
74
+ peak_gpu_mem_gb 51.700073 51.700073 51.700073
75
+ peak_gpu_mem_reserved_gb 53.640625 53.640625 53.640625
76
+ util 89.250000 96.000000 84.000000
77
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 1 : data loading time 0.81
78
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 1 : avg forward pass time 15.10
79
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 1 : backward pass time 11.36
80
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 1 : optimizer step time 1.27
81
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 1--------------------
82
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0068 iter count: 1.0
83
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0068 iter count: 1.0
84
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
85
+ [Apr 3, 2026 - 00:03:06 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
86
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=57.60GB peak=55.51GB
87
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.47GB peak=46.93GB
88
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
89
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
90
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
91
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
92
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
93
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
94
+ [MEM] fake_score_update: START: alloc=14.21GB reserved=58.37GB peak=55.69GB
95
+ [MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.25GB peak=47.71GB
96
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.71GB
97
+ [MEM-fwd] after block 0: alloc=15.71GB peak=47.71GB
98
+ [MEM-fwd] after block 10: alloc=21.52GB peak=47.71GB
99
+ [MEM-fwd] after block 20: alloc=26.81GB peak=47.71GB
100
+ [MEM-fwd] after block 29: alloc=31.57GB peak=47.71GB
101
+ [MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.71GB
102
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
103
+ Avg Max Min
104
+ cpu_mem_gb 38.658718 38.739353 38.585140
105
+ peak_gpu_mem_gb 52.593685 52.593685 52.593685
106
+ peak_gpu_mem_reserved_gb 54.365234 54.365234 54.365234
107
+ util 95.750000 100.000000 92.000000
108
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 2 : avg iteration time 51.59 seconds
109
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 2 : data loading time 0.00
110
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 2 : avg forward pass time 13.82
111
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 2 : backward pass time 11.41
112
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 2 : optimizer step time 1.17
113
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 2--------------------
114
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0070 iter count: 1.0
115
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0070 iter count: 1.0
116
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
117
+ [Apr 3, 2026 - 00:03:58 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
118
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=58.37GB peak=56.47GB
119
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
120
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
121
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
122
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
123
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
124
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
125
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
126
+ [MEM] fake_score_update: START: alloc=14.21GB reserved=58.40GB peak=55.69GB
127
+ [MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.29GB peak=47.72GB
128
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
129
+ [MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
130
+ [MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
131
+ [MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
132
+ [MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
133
+ [MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
134
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
135
+ Avg Max Min
136
+ cpu_mem_gb 38.659063 38.740227 38.585308
137
+ peak_gpu_mem_gb 52.593685 52.593685 52.593685
138
+ peak_gpu_mem_reserved_gb 54.404297 54.404297 54.404297
139
+ util 96.750000 100.000000 92.000000
140
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 3 : avg iteration time 58.07 seconds
141
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 3 : data loading time 0.00
142
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 3 : avg forward pass time 17.04
143
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 3 : backward pass time 11.38
144
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 3 : optimizer step time 1.18
145
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 3--------------------
146
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0072 iter count: 1.0
147
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0072 iter count: 1.0
148
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
149
+ [Apr 3, 2026 - 00:04:56 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
150
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=58.42GB peak=56.47GB
151
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.52GB peak=46.93GB
152
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
153
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
154
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
155
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
156
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
157
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
158
+ [MEM] fake_score_update: START: alloc=14.21GB reserved=58.42GB peak=55.69GB
159
+ [MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.27GB peak=47.72GB
160
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
161
+ [MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
162
+ [MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
163
+ [MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
164
+ [MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
165
+ [MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
166
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
167
+ Avg Max Min
168
+ cpu_mem_gb 38.659183 38.740608 38.585339
169
+ peak_gpu_mem_gb 52.593685 52.593685 52.593685
170
+ peak_gpu_mem_reserved_gb 54.443359 54.443359 54.443359
171
+ util 95.250000 100.000000 91.000000
172
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 4 : avg iteration time 57.93 seconds
173
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 4 : data loading time 0.00
174
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 4 : avg forward pass time 17.00
175
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 4 : backward pass time 11.36
176
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 4 : optimizer step time 1.18
177
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 4--------------------
178
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0073 iter count: 1.0
179
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0073 iter count: 1.0
180
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
181
+ [Apr 3, 2026 - 00:05:54 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
182
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=58.46GB peak=56.47GB
183
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
184
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
185
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
186
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
187
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
188
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
189
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
190
+ [MEM] student_update: START: alloc=14.32GB reserved=58.46GB peak=55.75GB
191
+ [MEM] student_update: after rollout: alloc=63.85GB reserved=66.24GB peak=65.62GB
192
+ [MEM] student_update: after perturb: alloc=63.87GB reserved=66.24GB peak=65.62GB
193
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=64.62GB peak=65.62GB
194
+ [MEM-fwd] after block 0: alloc=64.67GB peak=75.08GB
195
+ [MEM-fwd] after block 10: alloc=65.20GB peak=75.61GB
196
+ [MEM-fwd] after block 20: alloc=65.20GB peak=75.61GB
197
+ [MEM-fwd] after block 29: alloc=65.20GB peak=75.61GB
198
+ [MEM-fwd] after head+unpatchify: alloc=64.69GB peak=75.61GB
199
+ [MEM] student_update: after fake_score: alloc=63.95GB reserved=78.02GB peak=75.61GB
200
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.83GB peak=75.61GB
201
+ [MEM-fwd] after block 0: alloc=67.05GB peak=88.90GB
202
+ [MEM-fwd] after block 10: alloc=68.82GB peak=90.66GB
203
+ [MEM-fwd] after block 20: alloc=68.82GB peak=90.66GB
204
+ [MEM-fwd] after block 30: alloc=68.82GB peak=90.66GB
205
+ [MEM-fwd] after block 39: alloc=68.82GB peak=90.66GB
206
+ [MEM-fwd] after head+unpatchify: alloc=67.08GB peak=90.66GB
207
+ [MEM] student_update: after teacher: alloc=64.45GB reserved=93.79GB peak=90.66GB
208
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.37GB peak=90.66GB
209
+ [MEM-fwd] after block 0: alloc=67.08GB peak=90.66GB
210
+ [MEM-fwd] after block 10: alloc=68.84GB peak=90.68GB
211
+ [MEM-fwd] after block 20: alloc=68.84GB peak=90.68GB
212
+ [MEM-fwd] after block 30: alloc=68.84GB peak=90.68GB
213
+ [MEM-fwd] after block 39: alloc=68.84GB peak=90.68GB
214
+ [MEM-fwd] after head+unpatchify: alloc=67.10GB peak=90.68GB
215
+ [MEM] student_update: after CFG: alloc=64.45GB reserved=93.83GB peak=90.68GB
216
+ [MEM] student_update: after VSD loss: alloc=64.47GB reserved=93.83GB peak=90.68GB
217
+ [MEM] fake_score_update: START: alloc=12.68GB reserved=72.00GB peak=90.68GB
218
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.51GB reserved=52.62GB peak=48.98GB
219
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.28GB peak=48.98GB
220
+ [MEM-fwd] after block 0: alloc=16.91GB peak=48.98GB
221
+ [MEM-fwd] after block 10: alloc=22.73GB peak=48.98GB
222
+ [MEM-fwd] after block 20: alloc=28.01GB peak=48.98GB
223
+ [MEM-fwd] after block 29: alloc=32.77GB peak=48.98GB
224
+ [MEM-fwd] after head+unpatchify: alloc=33.85GB peak=48.98GB
225
+ [MEM] student_update: START: alloc=15.52GB reserved=61.80GB peak=56.95GB
226
+ [MEM] student_update: after rollout: alloc=65.05GB reserved=68.04GB peak=66.83GB
227
+ [MEM] student_update: after perturb: alloc=65.07GB reserved=68.04GB peak=66.83GB
228
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=65.82GB peak=66.83GB
229
+ [MEM-fwd] after block 0: alloc=65.87GB peak=76.28GB
230
+ [MEM-fwd] after block 10: alloc=66.40GB peak=76.81GB
231
+ [MEM-fwd] after block 20: alloc=66.40GB peak=76.81GB
232
+ [MEM-fwd] after block 29: alloc=66.40GB peak=76.81GB
233
+ [MEM-fwd] after head+unpatchify: alloc=65.90GB peak=76.81GB
234
+ [MEM] student_update: after fake_score: alloc=65.15GB reserved=79.81GB peak=76.81GB
235
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.08GB peak=76.81GB
236
+ [MEM-fwd] after block 0: alloc=67.78GB peak=89.62GB
237
+ [MEM-fwd] after block 10: alloc=69.54GB peak=91.39GB
238
+ [MEM-fwd] after block 20: alloc=69.54GB peak=91.39GB
239
+ [MEM-fwd] after block 30: alloc=69.54GB peak=91.39GB
240
+ [MEM-fwd] after block 39: alloc=69.54GB peak=91.39GB
241
+ [MEM-fwd] after head+unpatchify: alloc=67.80GB peak=91.39GB
242
+ [MEM] student_update: after teacher: alloc=65.17GB reserved=94.89GB peak=91.39GB
243
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.10GB peak=91.39GB
244
+ [MEM-fwd] after block 0: alloc=67.80GB peak=91.39GB
245
+ [MEM-fwd] after block 10: alloc=69.57GB peak=91.41GB
246
+ [MEM-fwd] after block 20: alloc=69.57GB peak=91.41GB
247
+ [MEM-fwd] after block 30: alloc=69.57GB peak=91.41GB
248
+ [MEM-fwd] after block 39: alloc=69.57GB peak=91.41GB
249
+ [MEM-fwd] after head+unpatchify: alloc=67.83GB peak=91.41GB
250
+ [MEM] student_update: after CFG: alloc=65.17GB reserved=94.91GB peak=91.41GB
251
+ [MEM] student_update: after VSD loss: alloc=65.20GB reserved=94.91GB peak=91.41GB
252
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
253
+ Avg Max Min
254
+ cpu_mem_gb 38.678225 38.757980 38.605282
255
+ peak_gpu_mem_gb 85.130531 85.130531 85.130531
256
+ peak_gpu_mem_reserved_gb 90.766602 90.771484 90.751953
257
+ util 67.500000 72.000000 61.000000
258
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 5 : avg iteration time 193.31 seconds
259
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 5 : data loading time 0.00
260
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 5 : avg forward pass time 86.26
261
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 5 : backward pass time 10.03
262
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 5 : optimizer step time 0.31
263
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 5--------------------
264
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.6670 iter count: 1.0
265
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_vsd_loss: 0.6670 iter count: 1.0
266
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_gen: 0.0000 iter count: 1.0
267
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0075 iter count: 1.0
268
+ [Apr 3, 2026 - 00:09:07 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
269
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=97.44GB peak=91.41GB
270
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.26GB reserved=53.38GB peak=49.73GB
271
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.03GB peak=49.73GB
272
+ [MEM-fwd] after block 0: alloc=17.66GB peak=49.73GB
273
+ [MEM-fwd] after block 10: alloc=23.48GB peak=49.73GB
274
+ [MEM-fwd] after block 20: alloc=28.76GB peak=49.73GB
275
+ [MEM-fwd] after block 29: alloc=33.52GB peak=49.73GB
276
+ [MEM-fwd] after head+unpatchify: alloc=34.60GB peak=49.73GB
277
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=62.77GB peak=58.43GB
278
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.28GB peak=49.69GB
279
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
280
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
281
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
282
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
283
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
284
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
285
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
286
+ Avg Max Min
287
+ cpu_mem_gb 38.678251 38.758018 38.605293
288
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
289
+ peak_gpu_mem_reserved_gb 57.687500 58.478516 55.314453
290
+ util 98.500000 100.000000 96.000000
291
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 6 : avg iteration time 52.27 seconds
292
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 6 : data loading time 0.00
293
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 6 : avg forward pass time 14.22
294
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 6 : backward pass time 11.33
295
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 6 : optimizer step time 1.17
296
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 6--------------------
297
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0073 iter count: 1.0
298
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0073 iter count: 1.0
299
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
300
+ [Apr 3, 2026 - 00:09:59 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
301
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.39GB peak=58.45GB
302
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.46GB peak=48.91GB
303
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
304
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
305
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
306
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
307
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
308
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
309
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.79GB peak=57.66GB
310
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.28GB peak=49.69GB
311
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
312
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
313
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
314
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
315
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
316
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
317
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
318
+ Avg Max Min
319
+ cpu_mem_gb 38.678332 38.758266 38.605301
320
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
321
+ peak_gpu_mem_reserved_gb 57.311523 57.853516 55.685547
322
+ util 97.250000 100.000000 92.000000
323
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 7 : avg iteration time 57.95 seconds
324
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 7 : data loading time 0.00
325
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 7 : avg forward pass time 17.05
326
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 7 : backward pass time 11.35
327
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 7 : optimizer step time 1.17
328
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 7--------------------
329
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0071 iter count: 1.0
330
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0071 iter count: 1.0
331
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
332
+ [Apr 3, 2026 - 00:10:57 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
333
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.79GB peak=58.45GB
334
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.50GB peak=48.91GB
335
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
336
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
337
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
338
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
339
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
340
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
341
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.71GB peak=57.66GB
342
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.28GB peak=49.69GB
343
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
344
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
345
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
346
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
347
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
348
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
349
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
350
+ Avg Max Min
351
+ cpu_mem_gb 38.678370 38.758301 38.605301
352
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
353
+ peak_gpu_mem_reserved_gb 55.509766 55.685547 55.451172
354
+ util 97.500000 100.000000 95.000000
355
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 8 : avg iteration time 57.73 seconds
356
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 8 : data loading time 0.00
357
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 8 : avg forward pass time 16.95
358
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 8 : backward pass time 11.31
359
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 8 : optimizer step time 1.18
360
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 8--------------------
361
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0067 iter count: 1.0
362
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0067 iter count: 1.0
363
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
364
+ [Apr 3, 2026 - 00:11:55 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
365
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.79GB peak=58.45GB
366
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.48GB peak=48.91GB
367
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
368
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
369
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
370
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
371
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
372
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
373
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.81GB peak=57.66GB
374
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.30GB peak=49.69GB
375
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
376
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
377
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
378
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
379
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
380
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
381
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
382
+ Avg Max Min
383
+ cpu_mem_gb 38.678534 38.758778 38.605354
384
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
385
+ peak_gpu_mem_reserved_gb 55.543945 55.705078 55.490234
386
+ util 96.750000 99.000000 94.000000
387
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 9 : avg iteration time 61.18 seconds
388
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 9 : data loading time 0.00
389
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 9 : avg forward pass time 18.67
390
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 9 : backward pass time 11.33
391
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 9 : optimizer step time 1.18
392
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 9--------------------
393
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0071 iter count: 1.0
394
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0071 iter count: 1.0
395
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
396
+ [Apr 3, 2026 - 00:12:56 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
397
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.81GB peak=58.45GB
398
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.50GB peak=48.91GB
399
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
400
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
401
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
402
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
403
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
404
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
405
+ [MEM] student_update: START: alloc=16.30GB reserved=59.83GB peak=57.72GB
406
+ [MEM] student_update: after rollout: alloc=65.82GB reserved=68.23GB peak=67.60GB
407
+ [MEM] student_update: after perturb: alloc=65.85GB reserved=68.23GB peak=67.60GB
408
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=66.60GB peak=67.60GB
409
+ [MEM-fwd] after block 0: alloc=66.64GB peak=77.05GB
410
+ [MEM-fwd] after block 10: alloc=67.17GB peak=77.58GB
411
+ [MEM-fwd] after block 20: alloc=67.17GB peak=77.58GB
412
+ [MEM-fwd] after block 29: alloc=67.17GB peak=77.58GB
413
+ [MEM-fwd] after head+unpatchify: alloc=66.67GB peak=77.58GB
414
+ [MEM] student_update: after fake_score: alloc=65.92GB reserved=79.54GB peak=77.58GB
415
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.85GB peak=77.58GB
416
+ [MEM-fwd] after block 0: alloc=68.55GB peak=90.40GB
417
+ [MEM-fwd] after block 10: alloc=70.31GB peak=92.16GB
418
+ [MEM-fwd] after block 20: alloc=70.31GB peak=92.16GB
419
+ [MEM-fwd] after block 30: alloc=70.31GB peak=92.16GB
420
+ [MEM-fwd] after block 39: alloc=70.31GB peak=92.16GB
421
+ [MEM-fwd] after head+unpatchify: alloc=68.58GB peak=92.16GB
422
+ [MEM] student_update: after teacher: alloc=65.95GB reserved=95.14GB peak=92.16GB
423
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.87GB peak=92.16GB
424
+ [MEM-fwd] after block 0: alloc=68.58GB peak=92.16GB
425
+ [MEM-fwd] after block 10: alloc=70.34GB peak=92.18GB
426
+ [MEM-fwd] after block 20: alloc=70.34GB peak=92.18GB
427
+ [MEM-fwd] after block 30: alloc=70.34GB peak=92.18GB
428
+ [MEM-fwd] after block 39: alloc=70.34GB peak=92.18GB
429
+ [MEM-fwd] after head+unpatchify: alloc=68.60GB peak=92.18GB
430
+ [MEM] student_update: after CFG: alloc=65.95GB reserved=95.16GB peak=92.18GB
431
+ [MEM] student_update: after VSD loss: alloc=65.97GB reserved=95.16GB peak=92.18GB
432
+ [MEM] fake_score_update: START: alloc=14.18GB reserved=98.71GB peak=92.18GB
433
+ [MEM] fake_score_update: after student gen (no_grad): alloc=17.01GB reserved=54.14GB peak=50.47GB
434
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.78GB peak=50.47GB
435
+ [MEM-fwd] after block 0: alloc=18.41GB peak=50.47GB
436
+ [MEM-fwd] after block 10: alloc=24.23GB peak=50.47GB
437
+ [MEM-fwd] after block 20: alloc=29.51GB peak=50.47GB
438
+ [MEM-fwd] after block 29: alloc=34.27GB peak=50.47GB
439
+ [MEM-fwd] after head+unpatchify: alloc=35.35GB peak=50.47GB
440
+ [MEM] student_update: START: alloc=17.02GB reserved=66.65GB peak=58.45GB
441
+ [MEM] student_update: after rollout: alloc=66.55GB reserved=69.49GB peak=68.33GB
442
+ [MEM] student_update: after perturb: alloc=66.57GB reserved=69.49GB peak=68.33GB
443
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=67.32GB peak=68.33GB
444
+ [MEM-fwd] after block 0: alloc=67.37GB peak=77.78GB
445
+ [MEM-fwd] after block 10: alloc=67.90GB peak=78.31GB
446
+ [MEM-fwd] after block 20: alloc=67.90GB peak=78.31GB
447
+ [MEM-fwd] after block 29: alloc=67.90GB peak=78.31GB
448
+ [MEM-fwd] after head+unpatchify: alloc=67.39GB peak=78.31GB
449
+ [MEM] student_update: after fake_score: alloc=66.65GB reserved=81.28GB peak=78.31GB
450
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.58GB peak=78.31GB
451
+ [MEM-fwd] after block 0: alloc=69.28GB peak=91.12GB
452
+ [MEM-fwd] after block 10: alloc=71.04GB peak=92.88GB
453
+ [MEM-fwd] after block 20: alloc=71.04GB peak=92.88GB
454
+ [MEM-fwd] after block 30: alloc=71.04GB peak=92.88GB
455
+ [MEM-fwd] after block 39: alloc=71.04GB peak=92.88GB
456
+ [MEM-fwd] after head+unpatchify: alloc=69.30GB peak=92.88GB
457
+ [MEM] student_update: after teacher: alloc=66.67GB reserved=96.36GB peak=92.88GB
458
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.60GB peak=92.88GB
459
+ [MEM-fwd] after block 0: alloc=69.30GB peak=92.88GB
460
+ [MEM-fwd] after block 10: alloc=71.06GB peak=92.91GB
461
+ [MEM-fwd] after block 20: alloc=71.06GB peak=92.91GB
462
+ [MEM-fwd] after block 30: alloc=71.06GB peak=92.91GB
463
+ [MEM-fwd] after block 39: alloc=71.06GB peak=92.91GB
464
+ [MEM-fwd] after head+unpatchify: alloc=69.32GB peak=92.91GB
465
+ [MEM] student_update: after CFG: alloc=66.67GB reserved=96.36GB peak=92.91GB
466
+ [MEM] student_update: after VSD loss: alloc=66.70GB reserved=96.36GB peak=92.91GB
467
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
468
+ Avg Max Min
469
+ cpu_mem_gb 38.679560 38.760101 38.606266
470
+ peak_gpu_mem_gb 86.525435 86.526208 86.525178
471
+ peak_gpu_mem_reserved_gb 92.133301 92.142578 92.105469
472
+ util 78.750000 83.000000 73.000000
473
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 10 : avg iteration time 185.69 seconds
474
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 10 : data loading time 0.00
475
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 10 : avg forward pass time 82.75
476
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 10 : backward pass time 10.03
477
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 10 : optimizer step time 0.21
478
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 10--------------------
479
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.7031 iter count: 1.0
480
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_vsd_loss: 0.7031 iter count: 1.0
481
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_gen: 0.0000 iter count: 1.0
482
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0071 iter count: 1.0
483
+ [Apr 3, 2026 - 00:16:02 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
484
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=98.90GB peak=92.91GB
485
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.26GB reserved=53.40GB peak=49.73GB
486
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.03GB peak=49.73GB
487
+ [MEM-fwd] after block 0: alloc=17.66GB peak=49.73GB
488
+ [MEM-fwd] after block 10: alloc=23.48GB peak=49.73GB
489
+ [MEM-fwd] after block 20: alloc=28.76GB peak=49.73GB
490
+ [MEM-fwd] after block 29: alloc=33.52GB peak=49.73GB
491
+ [MEM-fwd] after head+unpatchify: alloc=34.60GB peak=49.73GB
492
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=62.56GB peak=58.43GB
493
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.36GB peak=49.69GB
494
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
495
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
496
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
497
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
498
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
499
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
500
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
501
+ Avg Max Min
502
+ cpu_mem_gb 38.679598 38.760120 38.606396
503
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
504
+ peak_gpu_mem_reserved_gb 55.583008 55.802734 55.509766
505
+ util 97.250000 100.000000 91.000000
506
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 11 : avg iteration time 58.86 seconds
507
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 11 : data loading time 0.00
508
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 11 : avg forward pass time 17.51
509
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 11 : backward pass time 11.32
510
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 11 : optimizer step time 1.20
511
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 11--------------------
512
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0087 iter count: 1.0
513
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0087 iter count: 1.0
514
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
515
+ [Apr 3, 2026 - 00:17:01 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
516
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.92GB peak=58.45GB
517
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.52GB peak=48.91GB
518
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
519
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
520
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
521
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
522
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
523
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
524
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.44GB peak=57.66GB
525
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.30GB peak=49.69GB
526
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
527
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
528
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
529
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
530
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
531
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
532
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
533
+ Avg Max Min
534
+ cpu_mem_gb 38.679661 38.760372 38.606403
535
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
536
+ peak_gpu_mem_reserved_gb 55.514648 55.568359 55.353516
537
+ util 97.250000 100.000000 93.000000
538
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 12 : avg iteration time 54.55 seconds
539
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 12 : data loading time 0.00
540
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 12 : avg forward pass time 15.35
541
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 12 : backward pass time 11.34
542
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 12 : optimizer step time 1.17
543
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 12--------------------
544
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0070 iter count: 1.0
545
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0070 iter count: 1.0
546
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
547
+ [Apr 3, 2026 - 00:17:55 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
548
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.44GB peak=58.45GB
549
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.52GB peak=48.90GB
550
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.90GB
551
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.90GB
552
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.90GB
553
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.90GB
554
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.90GB
555
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.90GB
556
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.46GB peak=57.66GB
557
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.02GB peak=49.69GB
558
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
559
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
560
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
561
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
562
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
563
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
564
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
565
+ Avg Max Min
566
+ cpu_mem_gb 38.679665 38.760372 38.606419
567
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
568
+ peak_gpu_mem_reserved_gb 57.541016 58.224609 55.490234
569
+ util 97.750000 99.000000 94.000000
570
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 13 : avg iteration time 48.16 seconds
571
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 13 : data loading time 0.00
572
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 13 : avg forward pass time 12.17
573
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 13 : backward pass time 11.32
574
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 13 : optimizer step time 1.17
575
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 13--------------------
576
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0072 iter count: 1.0
577
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0072 iter count: 1.0
578
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
579
+ [Apr 3, 2026 - 00:18:43 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
580
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.58GB peak=58.45GB
581
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.54GB peak=48.91GB
582
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
583
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
584
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
585
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
586
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
587
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
588
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.44GB peak=57.66GB
589
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.23GB reserved=50.32GB peak=49.69GB
590
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.11GB peak=49.69GB
591
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.69GB
592
+ [MEM-fwd] after block 10: alloc=23.50GB peak=49.69GB
593
+ [MEM-fwd] after block 20: alloc=28.78GB peak=49.69GB
594
+ [MEM-fwd] after block 29: alloc=33.54GB peak=49.69GB
595
+ [MEM-fwd] after head+unpatchify: alloc=34.62GB peak=49.69GB
596
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
597
+ Avg Max Min
598
+ cpu_mem_gb 38.679850 38.761127 38.606380
599
+ peak_gpu_mem_gb 54.432534 54.433307 54.432277
600
+ peak_gpu_mem_reserved_gb 55.739258 55.744141 55.724609
601
+ util 98.000000 100.000000 94.000000
602
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 14 : avg iteration time 57.85 seconds
603
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 14 : data loading time 0.00
604
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 14 : avg forward pass time 17.00
605
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 14 : backward pass time 11.34
606
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 14 : optimizer step time 1.18
607
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 14--------------------
608
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0077 iter count: 1.0
609
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0077 iter count: 1.0
610
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
611
+ [Apr 3, 2026 - 00:19:41 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
612
+ [MEM] fake_score_update: START: alloc=15.40GB reserved=59.83GB peak=58.45GB
613
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.44GB reserved=49.50GB peak=48.91GB
614
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.32GB peak=48.91GB
615
+ [MEM-fwd] after block 0: alloc=16.90GB peak=48.91GB
616
+ [MEM-fwd] after block 10: alloc=22.71GB peak=48.91GB
617
+ [MEM-fwd] after block 20: alloc=28.00GB peak=48.91GB
618
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.91GB
619
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.91GB
620
+ [MEM] student_update: START: alloc=16.30GB reserved=59.44GB peak=57.72GB
621
+ [MEM] student_update: after rollout: alloc=65.84GB reserved=68.21GB peak=67.61GB
622
+ [MEM] student_update: after perturb: alloc=65.86GB reserved=68.21GB peak=67.61GB
623
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=66.61GB peak=67.61GB
624
+ [MEM-fwd] after block 0: alloc=66.66GB peak=77.07GB
625
+ [MEM-fwd] after block 10: alloc=67.19GB peak=77.60GB
626
+ [MEM-fwd] after block 20: alloc=67.19GB peak=77.60GB
627
+ [MEM-fwd] after block 29: alloc=67.19GB peak=77.60GB
628
+ [MEM-fwd] after head+unpatchify: alloc=66.68GB peak=77.60GB
629
+ [MEM] student_update: after fake_score: alloc=65.94GB reserved=80.02GB peak=77.60GB
630
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.86GB peak=77.60GB
631
+ [MEM-fwd] after block 0: alloc=68.57GB peak=90.41GB
632
+ [MEM-fwd] after block 10: alloc=70.33GB peak=92.17GB
633
+ [MEM-fwd] after block 20: alloc=70.33GB peak=92.17GB
634
+ [MEM-fwd] after block 30: alloc=70.33GB peak=92.17GB
635
+ [MEM-fwd] after block 39: alloc=70.33GB peak=92.17GB
636
+ [MEM-fwd] after head+unpatchify: alloc=68.59GB peak=92.17GB
637
+ [MEM] student_update: after teacher: alloc=65.96GB reserved=95.12GB peak=92.17GB
638
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.89GB peak=92.17GB
639
+ [MEM-fwd] after block 0: alloc=68.59GB peak=92.17GB
640
+ [MEM-fwd] after block 10: alloc=70.35GB peak=92.19GB
641
+ [MEM-fwd] after block 20: alloc=70.35GB peak=92.19GB
642
+ [MEM-fwd] after block 30: alloc=70.35GB peak=92.19GB
643
+ [MEM-fwd] after block 39: alloc=70.35GB peak=92.19GB
644
+ [MEM-fwd] after head+unpatchify: alloc=68.61GB peak=92.19GB
645
+ [MEM] student_update: after CFG: alloc=65.96GB reserved=95.12GB peak=92.19GB
646
+ [MEM] student_update: after VSD loss: alloc=65.98GB reserved=95.12GB peak=92.19GB
647
+ [MEM] fake_score_update: START: alloc=14.18GB reserved=98.66GB peak=92.19GB
648
+ [MEM] fake_score_update: after student gen (no_grad): alloc=17.02GB reserved=54.16GB peak=50.48GB
649
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.78GB peak=50.48GB
650
+ [MEM-fwd] after block 0: alloc=18.42GB peak=50.48GB
651
+ [MEM-fwd] after block 10: alloc=24.23GB peak=50.48GB
652
+ [MEM-fwd] after block 20: alloc=29.52GB peak=50.48GB
653
+ [MEM-fwd] after block 29: alloc=34.27GB peak=50.48GB
654
+ [MEM-fwd] after head+unpatchify: alloc=35.35GB peak=50.48GB
655
+ [MEM] student_update: START: alloc=17.03GB reserved=63.28GB peak=58.45GB
656
+ [MEM] student_update: after rollout: alloc=66.57GB reserved=69.53GB peak=68.34GB
657
+ [MEM] student_update: after perturb: alloc=66.59GB reserved=69.53GB peak=68.34GB
658
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=67.34GB peak=68.34GB
659
+ [MEM-fwd] after block 0: alloc=67.39GB peak=77.80GB
660
+ [MEM-fwd] after block 10: alloc=67.92GB peak=78.33GB
661
+ [MEM-fwd] after block 20: alloc=67.92GB peak=78.33GB
662
+ [MEM-fwd] after block 29: alloc=67.92GB peak=78.33GB
663
+ [MEM-fwd] after head+unpatchify: alloc=67.41GB peak=78.33GB
664
+ [MEM] student_update: after fake_score: alloc=66.67GB reserved=80.78GB peak=78.33GB
665
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.60GB peak=78.33GB
666
+ [MEM-fwd] after block 0: alloc=69.30GB peak=91.14GB
667
+ [MEM-fwd] after block 10: alloc=71.06GB peak=92.90GB
668
+ [MEM-fwd] after block 20: alloc=71.06GB peak=92.90GB
669
+ [MEM-fwd] after block 30: alloc=71.06GB peak=92.90GB
670
+ [MEM-fwd] after block 39: alloc=71.06GB peak=92.90GB
671
+ [MEM-fwd] after head+unpatchify: alloc=69.32GB peak=92.90GB
672
+ [MEM] student_update: after teacher: alloc=66.69GB reserved=96.38GB peak=92.90GB
673
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=68.62GB peak=92.90GB
674
+ [MEM-fwd] after block 0: alloc=69.32GB peak=92.90GB
675
+ [MEM-fwd] after block 10: alloc=71.08GB peak=92.93GB
676
+ [MEM-fwd] after block 20: alloc=71.08GB peak=92.93GB
677
+ [MEM-fwd] after block 30: alloc=71.08GB peak=92.93GB
678
+ [MEM-fwd] after block 39: alloc=71.08GB peak=92.93GB
679
+ [MEM-fwd] after head+unpatchify: alloc=69.34GB peak=92.93GB
680
+ [MEM] student_update: after CFG: alloc=66.69GB reserved=96.42GB peak=92.93GB
681
+ [MEM] student_update: after VSD loss: alloc=66.71GB reserved=96.42GB peak=92.93GB
682
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
683
+ Avg Max Min
684
+ cpu_mem_gb 38.679507 38.759026 38.606628
685
+ peak_gpu_mem_gb 86.542802 86.543575 86.542545
686
+ peak_gpu_mem_reserved_gb 92.208496 92.222656 92.166016
687
+ util 77.500000 82.000000 71.000000
688
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 15 : avg iteration time 182.57 seconds
689
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 15 : data loading time 0.00
690
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 15 : avg forward pass time 81.19
691
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 15 : backward pass time 10.03
692
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 15 : optimizer step time 0.22
693
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 15--------------------
694
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.6475 iter count: 1.0
695
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_vsd_loss: 0.6475 iter count: 1.0
696
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_gen: 0.0000 iter count: 1.0
697
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0067 iter count: 1.0
698
+ [Apr 3, 2026 - 00:22:44 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
699
+ [MEM] fake_score_update: START: alloc=13.44GB reserved=98.96GB peak=92.93GB
700
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.27GB reserved=53.40GB peak=49.73GB
701
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.04GB peak=49.73GB
702
+ [MEM-fwd] after block 0: alloc=17.67GB peak=49.73GB
703
+ [MEM-fwd] after block 10: alloc=23.49GB peak=49.73GB
704
+ [MEM-fwd] after block 20: alloc=28.77GB peak=49.73GB
705
+ [MEM-fwd] after block 29: alloc=33.53GB peak=49.73GB
706
+ [MEM-fwd] after head+unpatchify: alloc=34.61GB peak=49.73GB
707
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=62.56GB peak=58.43GB
708
+ [MEM] fake_score_update: after student gen (no_grad): alloc=16.24GB reserved=50.28GB peak=49.70GB
709
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=17.12GB peak=49.70GB
710
+ [MEM-fwd] after block 0: alloc=17.69GB peak=49.70GB
711
+ [MEM-fwd] after block 10: alloc=23.51GB peak=49.70GB
712
+ [MEM-fwd] after block 20: alloc=28.79GB peak=49.70GB
713
+ [MEM-fwd] after block 29: alloc=33.55GB peak=49.70GB
714
+ [MEM-fwd] after head+unpatchify: alloc=34.63GB peak=49.70GB
715
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
716
+ Avg Max Min
717
+ cpu_mem_gb 38.679513 38.759029 38.606636
718
+ peak_gpu_mem_gb 54.440308 54.441080 54.440050
719
+ peak_gpu_mem_reserved_gb 57.801758 58.539062 55.589844
720
+ util 98.500000 100.000000 95.000000
721
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 16 : avg iteration time 58.84 seconds
722
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 16 : data loading time 0.00
723
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 16 : avg forward pass time 17.46
724
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 16 : backward pass time 11.37
725
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 16 : optimizer step time 1.18
726
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 16--------------------
727
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0066 iter count: 1.0
728
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0066 iter count: 1.0
729
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
730
+ [Apr 3, 2026 - 00:23:43 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
731
+ [MEM] fake_score_update: START: alloc=15.41GB reserved=59.69GB peak=58.46GB
732
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.45GB reserved=49.55GB peak=48.92GB
733
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.33GB peak=48.92GB
734
+ [MEM-fwd] after block 0: alloc=16.91GB peak=48.92GB
735
+ [MEM-fwd] after block 10: alloc=22.72GB peak=48.92GB
736
+ [MEM-fwd] after block 20: alloc=28.01GB peak=48.92GB
737
+ [MEM-fwd] after block 29: alloc=32.76GB peak=48.92GB
738
+ [MEM-fwd] after head+unpatchify: alloc=33.84GB peak=48.92GB
739
+ [MEM] fake_score_update: START: alloc=16.19GB reserved=59.46GB peak=57.67GB
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/requirements.txt ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastgen==0.1.0
2
+ nvitop==1.6.1
3
+ ftfy==6.3.1
4
+ braceexpand==0.1.7
5
+ antlr4-python3-runtime==4.9.3
6
+ webdataset==1.0.2
7
+ sentry-sdk==2.53.0
8
+ rdkit==2025.9.5
9
+ python-dotenv==1.2.1
10
+ proglog==0.1.12
11
+ omegaconf==2.3.0
12
+ narwhals==2.17.0
13
+ loguru==0.7.3
14
+ imageio-ffmpeg==0.6.0
15
+ plotly==6.5.2
16
+ moviepy==2.2.1
17
+ hydra-core==1.3.2
18
+ wandb==0.25.0
19
+ fastgen==0.1.0
20
+ packaging==25.0
21
+ setuptools==80.10.2
22
+ wheel==0.46.3
23
+ pip==26.0.1
24
+ webencodings==0.5.1
25
+ pure_eval==0.2.3
26
+ ptyprocess==0.7.0
27
+ nvidia-ml-py==13.590.48
28
+ nvidia-cusparselt-cu12==0.7.1
29
+ mpmath==1.3.0
30
+ fastjsonschema==2.21.2
31
+ zipp==3.23.0
32
+ xyzservices==2025.11.0
33
+ widgetsnbextension==4.0.15
34
+ websocket-client==1.9.0
35
+ webcolors==25.10.0
36
+ wcwidth==0.6.0
37
+ urllib3==2.6.3
38
+ uri-template==1.3.0
39
+ tzdata==2025.3
40
+ typing_extensions==4.15.0
41
+ triton==3.6.0
42
+ traitlets==5.14.3
43
+ tqdm==4.67.3
44
+ tornado==6.5.5
45
+ tinycss2==1.4.0
46
+ sympy==1.14.0
47
+ soupsieve==2.8.3
48
+ smmap==5.0.3
49
+ six==1.16.0
50
+ sentencepiece==0.2.1
51
+ Send2Trash==2.1.0
52
+ safetensors==0.7.0
53
+ rpds-py==0.30.0
54
+ rfc3986-validator==0.1.1
55
+ regex==2026.2.28
56
+ pyzmq==27.1.0
57
+ PyYAML==6.0.3
58
+ python-json-logger==4.0.0
59
+ Pygments==2.19.2
60
+ pycparser==3.0
61
+ psutil==7.2.2
62
+ protobuf==4.24.4
63
+ prometheus_client==0.24.1
64
+ platformdirs==4.9.4
65
+ pillow==11.3.0
66
+ pexpect==4.9.0
67
+ parso==0.8.6
68
+ pandocfilters==1.5.1
69
+ nvidia-nvtx-cu12==12.8.90
70
+ nvidia-nvshmem-cu12==3.4.5
71
+ nvidia-nvjitlink-cu12==12.8.93
72
+ nvidia-nccl-cu12==2.27.5
73
+ nvidia-curand-cu12==10.3.9.90
74
+ nvidia-cufile-cu12==1.13.1.3
75
+ nvidia-cuda-runtime-cu12==12.8.90
76
+ nvidia-cuda-nvrtc-cu12==12.8.93
77
+ nvidia-cuda-cupti-cu12==12.8.90
78
+ nvidia-cublas-cu12==12.8.4.1
79
+ numpy==1.26.4
80
+ networkx==3.6.1
81
+ nest-asyncio==1.6.0
82
+ mistune==3.2.0
83
+ MarkupSafe==3.0.3
84
+ lark==1.3.1
85
+ jupyterlab_widgets==3.0.16
86
+ jupyterlab_pygments==0.3.0
87
+ jsonpointer==3.0.0
88
+ json5==0.13.0
89
+ jmespath==1.1.0
90
+ idna==3.11
91
+ hf-xet==1.4.2
92
+ h11==0.16.0
93
+ fsspec==2026.2.0
94
+ fqdn==1.5.1
95
+ filelock==3.25.2
96
+ executing==2.2.1
97
+ einops==0.8.2
98
+ defusedxml==0.7.1
99
+ decorator==5.2.1
100
+ debugpy==1.8.20
101
+ cuda-pathfinder==1.4.2
102
+ comm==0.2.3
103
+ click==8.3.1
104
+ charset-normalizer==3.4.5
105
+ certifi==2026.2.25
106
+ bleach==6.3.0
107
+ babel==2.18.0
108
+ av==17.0.0
109
+ attrs==25.4.0
110
+ async-lru==2.2.0
111
+ asttokens==3.0.1
112
+ annotated-types==0.7.0
113
+ typing-inspection==0.4.2
114
+ terminado==0.18.1
115
+ stack-data==0.6.3
116
+ scipy==1.17.1
117
+ rfc3987-syntax==1.1.0
118
+ rfc3339-validator==0.1.4
119
+ requests==2.32.5
120
+ referencing==0.37.0
121
+ python-dateutil==2.9.0.post0
122
+ pydantic_core==2.41.5
123
+ prompt_toolkit==3.0.52
124
+ opencv-python-headless==4.11.0.86
125
+ nvidia-cusparse-cu12==12.5.8.93
126
+ nvidia-cufft-cu12==11.3.3.83
127
+ nvidia-cudnn-cu12==9.10.2.21
128
+ matplotlib-inline==0.2.1
129
+ jupyter_core==5.9.1
130
+ Jinja2==3.1.6
131
+ jedi==0.19.2
132
+ ipython_pygments_lexers==1.1.1
133
+ importlib_metadata==8.7.1
134
+ ImageIO==2.37.3
135
+ httpcore==1.0.9
136
+ gitdb==4.0.12
137
+ cuda-bindings==12.9.4
138
+ contourpy==1.3.3
139
+ cffi==2.0.0
140
+ beautifulsoup4==4.14.3
141
+ anyio==4.12.1
142
+ soundfile==0.13.1
143
+ pydantic==2.12.5
144
+ nvidia-cusolver-cu12==11.7.3.90
145
+ jupyter_server_terminals==0.5.4
146
+ jupyter_client==8.8.0
147
+ jsonschema-specifications==2025.9.1
148
+ ipython==9.11.0
149
+ httpx==0.28.1
150
+ GitPython==3.1.46
151
+ botocore==1.42.68
152
+ bokeh==3.9.0
153
+ arrow==1.4.0
154
+ argon2-cffi-bindings==25.1.0
155
+ torch==2.10.0
156
+ s3transfer==0.16.0
157
+ jsonschema==4.26.0
158
+ isoduration==20.11.0
159
+ ipywidgets==8.1.8
160
+ ipykernel==7.2.0
161
+ argon2-cffi==25.1.0
162
+ torchvision==0.25.0
163
+ nbformat==5.10.4
164
+ jupyter-console==6.6.3
165
+ boto3==1.42.68
166
+ accelerate==1.13.0
167
+ nbclient==0.10.4
168
+ jupyter-events==0.12.0
169
+ nbconvert==7.17.0
170
+ jupyter_server==2.17.0
171
+ notebook_shim==0.2.4
172
+ jupyterlab_server==2.28.0
173
+ jupyter-lsp==2.3.0
174
+ jupyterlab==4.5.6
175
+ notebook==7.5.5
176
+ jupyter==1.1.1
177
+ fastgen==0.1.0
178
+ pandas==3.0.1
179
+ shellingham==1.5.4
180
+ mdurl==0.1.2
181
+ annotated-doc==0.0.4
182
+ markdown-it-py==4.0.0
183
+ rich==14.3.3
184
+ typer==0.24.1
185
+ huggingface_hub==1.7.1
186
+ timm==1.0.25
187
+ tokenizers==0.22.2
188
+ diffusers==0.37.0
189
+ transformers==5.3.0
190
+ peft==0.18.1
191
+ easydict==1.13
192
+ lmdb==2.2.0
193
+ threadpoolctl==3.6.0
194
+ soxr==1.0.0
195
+ msgpack==1.1.2
196
+ llvmlite==0.47.0
197
+ lazy-loader==0.5
198
+ joblib==1.5.3
199
+ audioread==3.1.0
200
+ scikit-learn==1.8.0
201
+ pooch==1.9.0
202
+ numba==0.65.0
203
+ librosa==0.11.0
204
+ simsimd==6.5.16
205
+ flatbuffers==25.12.19
206
+ tifffile==2026.3.3
207
+ stringzilla==4.6.0
208
+ pyparsing==3.3.2
209
+ prettytable==3.17.0
210
+ onnx==1.17.0
211
+ kiwisolver==1.5.0
212
+ fonttools==4.62.1
213
+ Cython==3.2.4
214
+ cycler==0.12.1
215
+ scikit-image==0.26.0
216
+ onnxruntime==1.24.4
217
+ matplotlib==3.10.8
218
+ albucore==0.0.24
219
+ albumentations==2.0.8
220
+ insightface==0.7.3
221
+ kornia_rs==0.1.10
222
+ kornia==0.8.2
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/files/wandb-metadata.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-151-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.12",
4
+ "startedAt": "2026-04-02T15:01:32.168210Z",
5
+ "args": [
6
+ "--config=fastgen/configs/experiments/OmniAvatar/config_sf.py",
7
+ "-",
8
+ "trainer.resume=False",
9
+ "log_config.name=sf_4gpu_bs8_lr2e6_5000iter_shift5_combined",
10
+ "log_config.project=OmniAvatar-FastGen"
11
+ ],
12
+ "program": "/home/work/.local/hyunbin/FastGen/train.py",
13
+ "codePath": "train.py",
14
+ "codePathLocal": "train.py",
15
+ "git": {
16
+ "remote": "https://paulcho98:@github.com/paulcho98/FastGen.git",
17
+ "commit": "04de80beaf50f849c12a55a5d8358d94530b7bb5"
18
+ },
19
+ "email": "paul.hyunbin@gmail.com",
20
+ "root": "/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined",
21
+ "host": "main1",
22
+ "executable": "/home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12",
23
+ "cpu_count": 112,
24
+ "cpu_count_logical": 224,
25
+ "gpu": "NVIDIA H200",
26
+ "gpu_count": 4,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1356758433792",
30
+ "used": "270456766464"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "2163961778176"
35
+ },
36
+ "gpu_nvidia": [
37
+ {
38
+ "name": "NVIDIA H200",
39
+ "memoryTotal": "150754820096",
40
+ "cudaCores": 16896,
41
+ "architecture": "Hopper",
42
+ "uuid": "GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c"
43
+ },
44
+ {
45
+ "name": "NVIDIA H200",
46
+ "memoryTotal": "150754820096",
47
+ "cudaCores": 16896,
48
+ "architecture": "Hopper",
49
+ "uuid": "GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262"
50
+ },
51
+ {
52
+ "name": "NVIDIA H200",
53
+ "memoryTotal": "150754820096",
54
+ "cudaCores": 16896,
55
+ "architecture": "Hopper",
56
+ "uuid": "GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790"
57
+ },
58
+ {
59
+ "name": "NVIDIA H200",
60
+ "memoryTotal": "150754820096",
61
+ "cudaCores": 16896,
62
+ "architecture": "Hopper",
63
+ "uuid": "GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0"
64
+ }
65
+ ],
66
+ "cudaVersion": "12.9",
67
+ "writerId": "wykcz6se3w95mxueg1dbfpdz1rkcn7vb"
68
+ }
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-core.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-03T00:01:32.229543031+09:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpprztt27o/port-1047116.txt","pid":1047116,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-03T00:01:32.230033289+09:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1047116}
3
+ {"time":"2026-04-03T00:01:32.230022361+09:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1047116-1057160-4156700530/socket","Net":"unix"}}
4
+ {"time":"2026-04-03T00:01:32.416224439+09:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-03T00:01:32.430551403+09:00","level":"INFO","msg":"handleInformInit: received","streamId":"nkf4iovm","id":"1(@)"}
6
+ {"time":"2026-04-03T00:01:32.80024046+09:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"nkf4iovm","id":"1(@)"}
7
+ {"time":"2026-04-03T00:01:38.985900014+09:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"rtqkkeh0iczo"}
8
+ {"time":"2026-04-03T00:24:18.430668942+09:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-04-03T00:01:32.430639522+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-04-03T00:01:32.800089951+09:00","level":"INFO","msg":"stream: created new stream","id":"nkf4iovm"}
3
+ {"time":"2026-04-03T00:01:32.800139938+09:00","level":"INFO","msg":"handler: started","stream_id":"nkf4iovm"}
4
+ {"time":"2026-04-03T00:01:32.800233729+09:00","level":"INFO","msg":"stream: started","id":"nkf4iovm"}
5
+ {"time":"2026-04-03T00:01:32.80025365+09:00","level":"INFO","msg":"sender: started","stream_id":"nkf4iovm"}
6
+ {"time":"2026-04-03T00:01:32.800252986+09:00","level":"INFO","msg":"writer: started","stream_id":"nkf4iovm"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Configure stats pid to 1047116
3
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-03 00:01:32,169 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug.log
5
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/logs/debug-internal.log
6
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_4gpu_bs8_lr2e6_5000iter_shift5_combined', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '5000', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7f328019bce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7f32803f2660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
9
+ 2026-04-03 00:01:32,170 INFO MainThread:1047116 [wandb_init.py:init():892] starting backend
10
+ 2026-04-03 00:01:32,416 INFO MainThread:1047116 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-03 00:01:32,428 INFO MainThread:1047116 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-03 00:01:32,432 INFO MainThread:1047116 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-03 00:01:32,448 INFO MainThread:1047116 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-03 00:01:33,668 INFO MainThread:1047116 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-04-03 00:01:33,982 INFO MainThread:1047116 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-04-03 00:01:33,986 INFO MainThread:1047116 [wandb_init.py:init():1082] run started, returning control to user process
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb/run-20260403_000132-nkf4iovm/run-nkf4iovm.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b0ff79a5dafcc07f84b00216e9f32cb24f03c12ace3669ef628f85ac889c1e
3
+ size 360448
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined/wandb_id.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nkf4iovm
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_train:
2
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
3
+ batch_size: '8'
4
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
5
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
6
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
7
+ num_workers: '2'
8
+ use_ref_sequence: 'True'
9
+ dataloader_val:
10
+ _target_: <function create_omniavatar_dataloader at 0x7fb9c8423a60>
11
+ batch_size: '1'
12
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
13
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
14
+ load_ode_path: 'False'
15
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
16
+ num_workers: '2'
17
+ use_ref_sequence: 'True'
18
+ eval:
19
+ max_ckpt: '100000000'
20
+ min_ckpt: '0'
21
+ num_samples: '50000'
22
+ samples_dir: samples
23
+ save_images: 'False'
24
+ log_config:
25
+ group: omniavatar_sf
26
+ name: sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2
27
+ project: OmniAvatar-FastGen
28
+ wandb_credential: ./credentials/wandb_api.txt
29
+ wandb_entity: paulhcho
30
+ wandb_mode: online
31
+ model:
32
+ add_teacher_to_fsdp_dict: 'True'
33
+ context_noise: '0.0'
34
+ ddp_find_unused_parameters: 'True'
35
+ device: cuda
36
+ discriminator:
37
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
38
+ all_res:
39
+ - '32'
40
+ - '16'
41
+ - '8'
42
+ feature_indices: '{0, 1, 2}'
43
+ in_channels: '256'
44
+ discriminator_optimizer:
45
+ _target_: <function get_optimizer at 0x7fb9c867e660>
46
+ betas:
47
+ - '0.9'
48
+ - '0.999'
49
+ eps: 1e-08
50
+ fused: 'False'
51
+ lr: '0.0001'
52
+ model: null
53
+ optim_type: adamw
54
+ weight_decay: '0.01'
55
+ discriminator_scheduler:
56
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
57
+ cycle_lengths:
58
+ - '10000000000'
59
+ f_max:
60
+ - '1.0'
61
+ f_min:
62
+ - '1.0'
63
+ f_start:
64
+ - 1e-06
65
+ warm_up_steps:
66
+ - '0'
67
+ enable_gradient_in_rollout: 'True'
68
+ enable_preprocessors: 'True'
69
+ fake_score: null
70
+ fake_score_net:
71
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
72
+ audio_hidden_size: '32'
73
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
74
+ in_dim: '65'
75
+ merge_lora: 'False'
76
+ mode: v2v
77
+ model_size: 1.3B
78
+ net_pred_type: flow
79
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
80
+ schedule_type: rf
81
+ use_audio: 'True'
82
+ fake_score_optimizer:
83
+ _target_: <function get_optimizer at 0x7fb9c867e660>
84
+ betas:
85
+ - '0.0'
86
+ - '0.999'
87
+ eps: 1e-08
88
+ fused: 'False'
89
+ lr: 2e-06
90
+ model: null
91
+ optim_type: adamw
92
+ weight_decay: '0.01'
93
+ fake_score_pred_type: x0
94
+ fake_score_scheduler:
95
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
96
+ cycle_lengths:
97
+ - '10000000000'
98
+ f_max:
99
+ - '1.0'
100
+ f_min:
101
+ - '1.0'
102
+ f_start:
103
+ - 1e-06
104
+ warm_up_steps:
105
+ - '0'
106
+ fsdp_meta_init: 'False'
107
+ gan_loss_weight_gen: '0'
108
+ gan_r1_reg_alpha: '0.1'
109
+ gan_r1_reg_weight: '0.0'
110
+ gan_use_same_t_noise: 'False'
111
+ grad_scaler_enabled: 'False'
112
+ grad_scaler_growth_interval: '2000'
113
+ grad_scaler_init_scale: '65536.0'
114
+ guidance_scale: '4.5'
115
+ input_shape:
116
+ - '16'
117
+ - '21'
118
+ - '64'
119
+ - '64'
120
+ last_step_only: 'False'
121
+ load_student_weights: 'False'
122
+ net:
123
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
124
+ audio_hidden_size: '32'
125
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
126
+ chunk_size: '3'
127
+ in_dim: '65'
128
+ mode: v2v
129
+ model_size: 1.3B
130
+ net_pred_type: flow
131
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
132
+ schedule_type: rf
133
+ total_num_frames: '21'
134
+ use_audio: 'True'
135
+ net_optimizer:
136
+ _target_: <function get_optimizer at 0x7fb9c867e660>
137
+ betas:
138
+ - '0.0'
139
+ - '0.999'
140
+ eps: 1e-08
141
+ fused: 'False'
142
+ lr: 2e-06
143
+ model: null
144
+ optim_type: adamw
145
+ weight_decay: '0.01'
146
+ net_scheduler:
147
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
148
+ cycle_lengths:
149
+ - '10000000000'
150
+ f_max:
151
+ - '1.0'
152
+ f_min:
153
+ - '1.0'
154
+ f_start:
155
+ - 1e-06
156
+ warm_up_steps:
157
+ - '0'
158
+ precision: bfloat16
159
+ precision_amp: null
160
+ precision_amp_enc: null
161
+ precision_amp_infer: null
162
+ precision_fsdp: bfloat16
163
+ pretrained_model_path: ''
164
+ pretrained_student_net_path: ''
165
+ same_step_across_blocks: 'True'
166
+ sample_t_cfg:
167
+ log_t_df: '0.01'
168
+ max_t: '0.999'
169
+ min_t: '0.001'
170
+ shift: '5.0'
171
+ t_list:
172
+ - '0.999'
173
+ - '0.937'
174
+ - '0.833'
175
+ - '0.624'
176
+ - '0.0'
177
+ time_dist_type: shifted
178
+ train_p_mean: '-1.1'
179
+ train_p_std: '2.0'
180
+ skip_layers: null
181
+ start_gradient_frame: '0'
182
+ student_sample_steps: '4'
183
+ student_sample_type: sde
184
+ student_update_freq: '5'
185
+ teacher:
186
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
187
+ audio_hidden_size: '32'
188
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
189
+ in_dim: '65'
190
+ merge_lora: 'True'
191
+ mode: v2v
192
+ model_size: 14B
193
+ net_pred_type: flow
194
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
195
+ schedule_type: rf
196
+ use_audio: 'True'
197
+ use_ema: 'False'
198
+ model_class:
199
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
200
+ config: null
201
+ trainer:
202
+ augment_pipe: null
203
+ batch_size_global: null
204
+ callbacks:
205
+ ema:
206
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
207
+ beta: '0.9999'
208
+ ema_halflife_kimg: '500'
209
+ ema_rampup_ratio: '0.05'
210
+ gamma: '16.97'
211
+ start_iter: '0'
212
+ type: constant
213
+ gpu_stats:
214
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
215
+ every_n: '100'
216
+ grad_clip:
217
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
218
+ grad_norm: '10.0'
219
+ model_key: net
220
+ param_count:
221
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
222
+ train_profiler:
223
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
224
+ every_n: '100'
225
+ wandb:
226
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
227
+ fps: '25'
228
+ sample_logging_iter: '100'
229
+ checkpointer:
230
+ pretrained_ckpt_key_map:
231
+ net: net
232
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
233
+ s3_container: s3://checkpoints/fastgen
234
+ s3_credential: ./credentials/s3.json
235
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_4gpu_bs8_lr2e6_5000iter_shift5_combined_v2/checkpoints
236
+ use_s3: 'False'
237
+ cudnn:
238
+ benchmark: 'True'
239
+ deterministic: 'False'
240
+ ddp: 'False'
241
+ fsdp: 'True'
242
+ fsdp_cpu_offload: 'False'
243
+ fsdp_min_num_params: '10000000'
244
+ fsdp_sharding_group_size: null
245
+ global_vars: null
246
+ global_vars_val:
247
+ - null
248
+ grad_accum_rounds: '2'
249
+ logging_iter: '1'
250
+ max_iter: '5000'
251
+ offload_module_in_decoding: 'False'
252
+ resume: 'False'
253
+ save_ckpt_iter: '100'
254
+ seed: '0'
255
+ skip_initial_validation: 'True'
256
+ tf32_enabled: 'True'
257
+ val_seed: null
258
+ validation_iter: '100'
259
+ visualize_teacher: 'False'
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_train:
2
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
3
+ batch_size: '8'
4
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
5
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
6
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
7
+ num_workers: '2'
8
+ use_ref_sequence: 'True'
9
+ dataloader_val:
10
+ _target_: <function create_omniavatar_dataloader at 0x7f7c52b8fce0>
11
+ batch_size: '1'
12
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
13
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
14
+ load_ode_path: 'False'
15
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
16
+ num_workers: '2'
17
+ use_ref_sequence: 'True'
18
+ eval:
19
+ max_ckpt: '100000000'
20
+ min_ckpt: '0'
21
+ num_samples: '50000'
22
+ samples_dir: samples
23
+ save_images: 'False'
24
+ log_config:
25
+ group: omniavatar_sf
26
+ name: sf_combined_debug
27
+ project: OmniAvatar-FastGen
28
+ wandb_credential: ./credentials/wandb_api.txt
29
+ wandb_entity: paulhcho
30
+ wandb_mode: disabled
31
+ model:
32
+ add_teacher_to_fsdp_dict: 'True'
33
+ context_noise: '0.0'
34
+ ddp_find_unused_parameters: 'True'
35
+ device: cuda
36
+ discriminator:
37
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
38
+ all_res:
39
+ - '32'
40
+ - '16'
41
+ - '8'
42
+ feature_indices: '{0, 1, 2}'
43
+ in_channels: '256'
44
+ discriminator_optimizer:
45
+ _target_: <function get_optimizer at 0x7f7c52bf2660>
46
+ betas:
47
+ - '0.9'
48
+ - '0.999'
49
+ eps: 1e-08
50
+ fused: 'False'
51
+ lr: '0.0001'
52
+ model: null
53
+ optim_type: adamw
54
+ weight_decay: '0.01'
55
+ discriminator_scheduler:
56
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
57
+ cycle_lengths:
58
+ - '10000000000'
59
+ f_max:
60
+ - '1.0'
61
+ f_min:
62
+ - '1.0'
63
+ f_start:
64
+ - 1e-06
65
+ warm_up_steps:
66
+ - '0'
67
+ enable_gradient_in_rollout: 'True'
68
+ enable_preprocessors: 'True'
69
+ fake_score: null
70
+ fake_score_net:
71
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
72
+ audio_hidden_size: '32'
73
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
74
+ in_dim: '65'
75
+ merge_lora: 'False'
76
+ mode: v2v
77
+ model_size: 1.3B
78
+ net_pred_type: flow
79
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
80
+ schedule_type: rf
81
+ use_audio: 'True'
82
+ fake_score_optimizer:
83
+ _target_: <function get_optimizer at 0x7f7c52bf2660>
84
+ betas:
85
+ - '0.0'
86
+ - '0.999'
87
+ eps: 1e-08
88
+ fused: 'False'
89
+ lr: 2e-06
90
+ model: null
91
+ optim_type: adamw
92
+ weight_decay: '0.01'
93
+ fake_score_pred_type: x0
94
+ fake_score_scheduler:
95
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
96
+ cycle_lengths:
97
+ - '10000000000'
98
+ f_max:
99
+ - '1.0'
100
+ f_min:
101
+ - '1.0'
102
+ f_start:
103
+ - 1e-06
104
+ warm_up_steps:
105
+ - '0'
106
+ fsdp_meta_init: 'False'
107
+ gan_loss_weight_gen: '0'
108
+ gan_r1_reg_alpha: '0.1'
109
+ gan_r1_reg_weight: '0.0'
110
+ gan_use_same_t_noise: 'False'
111
+ grad_scaler_enabled: 'False'
112
+ grad_scaler_growth_interval: '2000'
113
+ grad_scaler_init_scale: '65536.0'
114
+ guidance_scale: '4.5'
115
+ input_shape:
116
+ - '16'
117
+ - '21'
118
+ - '64'
119
+ - '64'
120
+ last_step_only: 'False'
121
+ load_student_weights: 'False'
122
+ net:
123
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
124
+ audio_hidden_size: '32'
125
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
126
+ chunk_size: '3'
127
+ in_dim: '65'
128
+ mode: v2v
129
+ model_size: 1.3B
130
+ net_pred_type: flow
131
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
132
+ schedule_type: rf
133
+ total_num_frames: '21'
134
+ use_audio: 'True'
135
+ net_optimizer:
136
+ _target_: <function get_optimizer at 0x7f7c52bf2660>
137
+ betas:
138
+ - '0.0'
139
+ - '0.999'
140
+ eps: 1e-08
141
+ fused: 'False'
142
+ lr: 2e-06
143
+ model: null
144
+ optim_type: adamw
145
+ weight_decay: '0.01'
146
+ net_scheduler:
147
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
148
+ cycle_lengths:
149
+ - '10000000000'
150
+ f_max:
151
+ - '1.0'
152
+ f_min:
153
+ - '1.0'
154
+ f_start:
155
+ - 1e-06
156
+ warm_up_steps:
157
+ - '0'
158
+ precision: bfloat16
159
+ precision_amp: null
160
+ precision_amp_enc: null
161
+ precision_amp_infer: null
162
+ precision_fsdp: bfloat16
163
+ pretrained_model_path: ''
164
+ pretrained_student_net_path: ''
165
+ same_step_across_blocks: 'True'
166
+ sample_t_cfg:
167
+ log_t_df: '0.01'
168
+ max_t: '0.999'
169
+ min_t: '0.001'
170
+ shift: '5.0'
171
+ t_list:
172
+ - '0.999'
173
+ - '0.937'
174
+ - '0.833'
175
+ - '0.624'
176
+ - '0.0'
177
+ time_dist_type: shifted
178
+ train_p_mean: '-1.1'
179
+ train_p_std: '2.0'
180
+ skip_layers: null
181
+ start_gradient_frame: '0'
182
+ student_sample_steps: '4'
183
+ student_sample_type: sde
184
+ student_update_freq: '5'
185
+ teacher:
186
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
187
+ audio_hidden_size: '32'
188
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
189
+ in_dim: '65'
190
+ merge_lora: 'True'
191
+ mode: v2v
192
+ model_size: 14B
193
+ net_pred_type: flow
194
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
195
+ schedule_type: rf
196
+ use_audio: 'True'
197
+ use_ema: 'False'
198
+ model_class:
199
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
200
+ config: null
201
+ trainer:
202
+ augment_pipe: null
203
+ batch_size_global: null
204
+ callbacks:
205
+ ema:
206
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
207
+ beta: '0.9999'
208
+ ema_halflife_kimg: '500'
209
+ ema_rampup_ratio: '0.05'
210
+ gamma: '16.97'
211
+ start_iter: '0'
212
+ type: constant
213
+ gpu_stats:
214
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
215
+ every_n: '100'
216
+ grad_clip:
217
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
218
+ grad_norm: '10.0'
219
+ model_key: net
220
+ param_count:
221
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
222
+ train_profiler:
223
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
224
+ every_n: '100'
225
+ wandb:
226
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
227
+ fps: '25'
228
+ sample_logging_iter: '100'
229
+ checkpointer:
230
+ pretrained_ckpt_key_map:
231
+ net: net
232
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
233
+ s3_container: s3://checkpoints/fastgen
234
+ s3_credential: ./credentials/s3.json
235
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/checkpoints
236
+ use_s3: 'False'
237
+ cudnn:
238
+ benchmark: 'True'
239
+ deterministic: 'False'
240
+ ddp: 'False'
241
+ fsdp: 'True'
242
+ fsdp_cpu_offload: 'False'
243
+ fsdp_min_num_params: '10000000'
244
+ fsdp_sharding_group_size: null
245
+ global_vars: null
246
+ global_vars_val:
247
+ - null
248
+ grad_accum_rounds: '2'
249
+ logging_iter: '1'
250
+ max_iter: '10'
251
+ offload_module_in_decoding: 'False'
252
+ resume: 'False'
253
+ save_ckpt_iter: '100'
254
+ seed: '0'
255
+ skip_initial_validation: 'True'
256
+ tf32_enabled: 'True'
257
+ val_seed: null
258
+ validation_iter: '100'
259
+ visualize_teacher: 'False'
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_debug/wandb_id.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ k4ws77lt
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_train:
2
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
3
+ batch_size: '8'
4
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
5
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
6
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
7
+ num_workers: '2'
8
+ use_ref_sequence: 'True'
9
+ dataloader_val:
10
+ _target_: <function create_omniavatar_dataloader at 0x7f99973a7ce0>
11
+ batch_size: '1'
12
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
13
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
14
+ load_ode_path: 'False'
15
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
16
+ num_workers: '2'
17
+ use_ref_sequence: 'True'
18
+ eval:
19
+ max_ckpt: '100000000'
20
+ min_ckpt: '0'
21
+ num_samples: '50000'
22
+ samples_dir: samples
23
+ save_images: 'False'
24
+ log_config:
25
+ group: omniavatar_sf
26
+ name: sf_combined_step_test
27
+ project: OmniAvatar-FastGen
28
+ wandb_credential: ./credentials/wandb_api.txt
29
+ wandb_entity: paulhcho
30
+ wandb_mode: online
31
+ model:
32
+ add_teacher_to_fsdp_dict: 'True'
33
+ context_noise: '0.0'
34
+ ddp_find_unused_parameters: 'True'
35
+ device: cuda
36
+ discriminator:
37
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
38
+ all_res:
39
+ - '32'
40
+ - '16'
41
+ - '8'
42
+ feature_indices: '{0, 1, 2}'
43
+ in_channels: '256'
44
+ discriminator_optimizer:
45
+ _target_: <function get_optimizer at 0x7f99975fe660>
46
+ betas:
47
+ - '0.9'
48
+ - '0.999'
49
+ eps: 1e-08
50
+ fused: 'False'
51
+ lr: '0.0001'
52
+ model: null
53
+ optim_type: adamw
54
+ weight_decay: '0.01'
55
+ discriminator_scheduler:
56
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
57
+ cycle_lengths:
58
+ - '10000000000'
59
+ f_max:
60
+ - '1.0'
61
+ f_min:
62
+ - '1.0'
63
+ f_start:
64
+ - 1e-06
65
+ warm_up_steps:
66
+ - '0'
67
+ enable_gradient_in_rollout: 'True'
68
+ enable_preprocessors: 'True'
69
+ fake_score: null
70
+ fake_score_net:
71
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
72
+ audio_hidden_size: '32'
73
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
74
+ in_dim: '65'
75
+ merge_lora: 'False'
76
+ mode: v2v
77
+ model_size: 1.3B
78
+ net_pred_type: flow
79
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
80
+ schedule_type: rf
81
+ use_audio: 'True'
82
+ fake_score_optimizer:
83
+ _target_: <function get_optimizer at 0x7f99975fe660>
84
+ betas:
85
+ - '0.0'
86
+ - '0.999'
87
+ eps: 1e-08
88
+ fused: 'False'
89
+ lr: 2e-06
90
+ model: null
91
+ optim_type: adamw
92
+ weight_decay: '0.01'
93
+ fake_score_pred_type: x0
94
+ fake_score_scheduler:
95
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
96
+ cycle_lengths:
97
+ - '10000000000'
98
+ f_max:
99
+ - '1.0'
100
+ f_min:
101
+ - '1.0'
102
+ f_start:
103
+ - 1e-06
104
+ warm_up_steps:
105
+ - '0'
106
+ fsdp_meta_init: 'False'
107
+ gan_loss_weight_gen: '0'
108
+ gan_r1_reg_alpha: '0.1'
109
+ gan_r1_reg_weight: '0.0'
110
+ gan_use_same_t_noise: 'False'
111
+ grad_scaler_enabled: 'False'
112
+ grad_scaler_growth_interval: '2000'
113
+ grad_scaler_init_scale: '65536.0'
114
+ guidance_scale: '4.5'
115
+ input_shape:
116
+ - '16'
117
+ - '21'
118
+ - '64'
119
+ - '64'
120
+ last_step_only: 'False'
121
+ load_student_weights: 'False'
122
+ net:
123
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
124
+ audio_hidden_size: '32'
125
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
126
+ chunk_size: '3'
127
+ in_dim: '65'
128
+ mode: v2v
129
+ model_size: 1.3B
130
+ net_pred_type: flow
131
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
132
+ schedule_type: rf
133
+ total_num_frames: '21'
134
+ use_audio: 'True'
135
+ net_optimizer:
136
+ _target_: <function get_optimizer at 0x7f99975fe660>
137
+ betas:
138
+ - '0.0'
139
+ - '0.999'
140
+ eps: 1e-08
141
+ fused: 'False'
142
+ lr: 2e-06
143
+ model: null
144
+ optim_type: adamw
145
+ weight_decay: '0.01'
146
+ net_scheduler:
147
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
148
+ cycle_lengths:
149
+ - '10000000000'
150
+ f_max:
151
+ - '1.0'
152
+ f_min:
153
+ - '1.0'
154
+ f_start:
155
+ - 1e-06
156
+ warm_up_steps:
157
+ - '0'
158
+ precision: bfloat16
159
+ precision_amp: null
160
+ precision_amp_enc: null
161
+ precision_amp_infer: null
162
+ precision_fsdp: bfloat16
163
+ pretrained_model_path: ''
164
+ pretrained_student_net_path: ''
165
+ same_step_across_blocks: 'True'
166
+ sample_t_cfg:
167
+ log_t_df: '0.01'
168
+ max_t: '0.999'
169
+ min_t: '0.001'
170
+ shift: '5.0'
171
+ t_list:
172
+ - '0.999'
173
+ - '0.937'
174
+ - '0.833'
175
+ - '0.624'
176
+ - '0.0'
177
+ time_dist_type: shifted
178
+ train_p_mean: '-1.1'
179
+ train_p_std: '2.0'
180
+ skip_layers: null
181
+ start_gradient_frame: '0'
182
+ student_sample_steps: '4'
183
+ student_sample_type: sde
184
+ student_update_freq: '5'
185
+ teacher:
186
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
187
+ audio_hidden_size: '32'
188
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
189
+ in_dim: '65'
190
+ merge_lora: 'True'
191
+ mode: v2v
192
+ model_size: 14B
193
+ net_pred_type: flow
194
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
195
+ schedule_type: rf
196
+ use_audio: 'True'
197
+ use_ema: 'False'
198
+ model_class:
199
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
200
+ config: null
201
+ trainer:
202
+ augment_pipe: null
203
+ batch_size_global: null
204
+ callbacks:
205
+ ema:
206
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
207
+ beta: '0.9999'
208
+ ema_halflife_kimg: '500'
209
+ ema_rampup_ratio: '0.05'
210
+ gamma: '16.97'
211
+ start_iter: '0'
212
+ type: constant
213
+ gpu_stats:
214
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
215
+ every_n: '100'
216
+ grad_clip:
217
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
218
+ grad_norm: '10.0'
219
+ model_key: net
220
+ param_count:
221
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
222
+ train_profiler:
223
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
224
+ every_n: '100'
225
+ wandb:
226
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
227
+ fps: '25'
228
+ sample_logging_iter: '100'
229
+ checkpointer:
230
+ pretrained_ckpt_key_map:
231
+ net: net
232
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
233
+ s3_container: s3://checkpoints/fastgen
234
+ s3_credential: ./credentials/s3.json
235
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints
236
+ use_s3: 'False'
237
+ cudnn:
238
+ benchmark: 'True'
239
+ deterministic: 'False'
240
+ ddp: 'False'
241
+ fsdp: 'True'
242
+ fsdp_cpu_offload: 'False'
243
+ fsdp_min_num_params: '10000000'
244
+ fsdp_sharding_group_size: null
245
+ global_vars: null
246
+ global_vars_val:
247
+ - null
248
+ grad_accum_rounds: '2'
249
+ logging_iter: '1'
250
+ max_iter: '20'
251
+ offload_module_in_decoding: 'False'
252
+ resume: 'False'
253
+ save_ckpt_iter: '100'
254
+ seed: '0'
255
+ skip_initial_validation: 'True'
256
+ tf32_enabled: 'True'
257
+ val_seed: null
258
+ validation_iter: '100'
259
+ visualize_teacher: 'False'
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-02T22:45:31.396833745+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-04-02T22:45:31.746696076+09:00","level":"INFO","msg":"stream: created new stream","id":"spcd04xe"}
3
+ {"time":"2026-04-02T22:45:31.746744779+09:00","level":"INFO","msg":"handler: started","stream_id":"spcd04xe"}
4
+ {"time":"2026-04-02T22:45:31.746822827+09:00","level":"INFO","msg":"stream: started","id":"spcd04xe"}
5
+ {"time":"2026-04-02T22:45:31.746841154+09:00","level":"INFO","msg":"sender: started","stream_id":"spcd04xe"}
6
+ {"time":"2026-04-02T22:45:31.74684523+09:00","level":"INFO","msg":"writer: started","stream_id":"spcd04xe"}
7
+ {"time":"2026-04-02T22:52:57.91103952+09:00","level":"INFO","msg":"stream: closing","id":"spcd04xe"}
8
+ {"time":"2026-04-02T22:52:58.486184439+09:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-04-02T22:52:58.869727008+09:00","level":"INFO","msg":"handler: closed","stream_id":"spcd04xe"}
10
+ {"time":"2026-04-02T22:52:58.869816672+09:00","level":"INFO","msg":"sender: closed","stream_id":"spcd04xe"}
11
+ {"time":"2026-04-02T22:52:58.869827326+09:00","level":"INFO","msg":"stream: closed","id":"spcd04xe"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/debug.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Configure stats pid to 792541
3
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log
5
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log
6
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_combined_step_test', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '20', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7ff2f56a3ce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
9
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():892] starting backend
10
+ 2026-04-02 22:45:31,381 INFO MainThread:792541 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-02 22:45:31,394 INFO MainThread:792541 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-02 22:45:31,398 INFO MainThread:792541 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-02 22:45:31,413 INFO MainThread:792541 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-02 22:45:32,719 INFO MainThread:792541 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-04-02 22:45:32,998 INFO MainThread:792541 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [service_client.py:_forward_responses():134] Reached EOF.
21
+ 2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/config.yaml ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.25.0
4
+ e:
5
+ mf7qlulwtbceq6fkw52thj4sgf25e3dz:
6
+ args:
7
+ - --config=fastgen/configs/experiments/OmniAvatar/config_sf.py
8
+ - '-'
9
+ - trainer.resume=False
10
+ - log_config.name=sf_combined_step_test
11
+ - log_config.project=OmniAvatar-FastGen
12
+ - trainer.max_iter=20
13
+ codePath: train.py
14
+ codePathLocal: train.py
15
+ cpu_count: 112
16
+ cpu_count_logical: 224
17
+ cudaVersion: "12.9"
18
+ disk:
19
+ /:
20
+ total: "1356758433792"
21
+ used: "257963536384"
22
+ email: paul.hyunbin@gmail.com
23
+ executable: /home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12
24
+ git:
25
+ commit: 04de80beaf50f849c12a55a5d8358d94530b7bb5
26
+ remote: https://paulcho98:@github.com/paulcho98/FastGen.git
27
+ gpu: NVIDIA H200
28
+ gpu_count: 4
29
+ gpu_nvidia:
30
+ - architecture: Hopper
31
+ cudaCores: 16896
32
+ memoryTotal: "150754820096"
33
+ name: NVIDIA H200
34
+ uuid: GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c
35
+ - architecture: Hopper
36
+ cudaCores: 16896
37
+ memoryTotal: "150754820096"
38
+ name: NVIDIA H200
39
+ uuid: GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262
40
+ - architecture: Hopper
41
+ cudaCores: 16896
42
+ memoryTotal: "150754820096"
43
+ name: NVIDIA H200
44
+ uuid: GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790
45
+ - architecture: Hopper
46
+ cudaCores: 16896
47
+ memoryTotal: "150754820096"
48
+ name: NVIDIA H200
49
+ uuid: GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0
50
+ host: main1
51
+ memory:
52
+ total: "2163961778176"
53
+ os: Linux-5.15.0-151-generic-x86_64-with-glibc2.39
54
+ program: /home/work/.local/hyunbin/FastGen/train.py
55
+ python: CPython 3.12.12
56
+ root: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test
57
+ startedAt: "2026-04-02T13:45:31.145420Z"
58
+ writerId: mf7qlulwtbceq6fkw52thj4sgf25e3dz
59
+ m: []
60
+ python_version: 3.12.12
61
+ t:
62
+ "1":
63
+ - 1
64
+ - 5
65
+ - 11
66
+ - 41
67
+ - 49
68
+ - 50
69
+ - 53
70
+ - 63
71
+ - 71
72
+ - 83
73
+ - 98
74
+ "2":
75
+ - 1
76
+ - 5
77
+ - 11
78
+ - 41
79
+ - 49
80
+ - 50
81
+ - 53
82
+ - 63
83
+ - 71
84
+ - 83
85
+ - 98
86
+ "3":
87
+ - 13
88
+ - 14
89
+ - 16
90
+ - 61
91
+ - 62
92
+ "4": 3.12.12
93
+ "5": 0.25.0
94
+ "6": 5.3.0
95
+ "12": 0.25.0
96
+ "13": linux-x86_64
97
+ dataloader_train:
98
+ value:
99
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
100
+ batch_size: "8"
101
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
102
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
103
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
104
+ num_workers: "2"
105
+ use_ref_sequence: "True"
106
+ dataloader_val:
107
+ value:
108
+ _target_: <function create_omniavatar_dataloader at 0x7ff2f56a3ce0>
109
+ batch_size: "1"
110
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
111
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
112
+ load_ode_path: "False"
113
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
114
+ num_workers: "2"
115
+ use_ref_sequence: "True"
116
+ eval:
117
+ value:
118
+ max_ckpt: "100000000"
119
+ min_ckpt: "0"
120
+ num_samples: "50000"
121
+ samples_dir: samples
122
+ save_images: "False"
123
+ log_config:
124
+ value:
125
+ group: omniavatar_sf
126
+ name: sf_combined_step_test
127
+ project: OmniAvatar-FastGen
128
+ wandb_credential: ./credentials/wandb_api.txt
129
+ wandb_entity: paulhcho
130
+ wandb_mode: online
131
+ model:
132
+ value:
133
+ add_teacher_to_fsdp_dict: "True"
134
+ context_noise: "0.0"
135
+ ddp_find_unused_parameters: "True"
136
+ device: cuda
137
+ discriminator:
138
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
139
+ all_res:
140
+ - "32"
141
+ - "16"
142
+ - "8"
143
+ feature_indices: '{0, 1, 2}'
144
+ in_channels: "256"
145
+ discriminator_optimizer:
146
+ _target_: <function get_optimizer at 0x7ff2f5916660>
147
+ betas:
148
+ - "0.9"
149
+ - "0.999"
150
+ eps: "1e-08"
151
+ fused: "False"
152
+ lr: "0.0001"
153
+ model: null
154
+ optim_type: adamw
155
+ weight_decay: "0.01"
156
+ discriminator_scheduler:
157
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
158
+ cycle_lengths:
159
+ - "10000000000"
160
+ f_max:
161
+ - "1.0"
162
+ f_min:
163
+ - "1.0"
164
+ f_start:
165
+ - "1e-06"
166
+ warm_up_steps:
167
+ - "0"
168
+ enable_gradient_in_rollout: "True"
169
+ enable_preprocessors: "True"
170
+ fake_score: null
171
+ fake_score_net:
172
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
173
+ audio_hidden_size: "32"
174
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
175
+ in_dim: "65"
176
+ merge_lora: "False"
177
+ mode: v2v
178
+ model_size: 1.3B
179
+ net_pred_type: flow
180
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
181
+ schedule_type: rf
182
+ use_audio: "True"
183
+ fake_score_optimizer:
184
+ _target_: <function get_optimizer at 0x7ff2f5916660>
185
+ betas:
186
+ - "0.0"
187
+ - "0.999"
188
+ eps: "1e-08"
189
+ fused: "False"
190
+ lr: "2e-06"
191
+ model: null
192
+ optim_type: adamw
193
+ weight_decay: "0.01"
194
+ fake_score_pred_type: x0
195
+ fake_score_scheduler:
196
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
197
+ cycle_lengths:
198
+ - "10000000000"
199
+ f_max:
200
+ - "1.0"
201
+ f_min:
202
+ - "1.0"
203
+ f_start:
204
+ - "1e-06"
205
+ warm_up_steps:
206
+ - "0"
207
+ fsdp_meta_init: "False"
208
+ gan_loss_weight_gen: "0"
209
+ gan_r1_reg_alpha: "0.1"
210
+ gan_r1_reg_weight: "0.0"
211
+ gan_use_same_t_noise: "False"
212
+ grad_scaler_enabled: "False"
213
+ grad_scaler_growth_interval: "2000"
214
+ grad_scaler_init_scale: "65536.0"
215
+ guidance_scale: "4.5"
216
+ input_shape:
217
+ - "16"
218
+ - "21"
219
+ - "64"
220
+ - "64"
221
+ last_step_only: "False"
222
+ load_student_weights: "False"
223
+ net:
224
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
225
+ audio_hidden_size: "32"
226
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
227
+ chunk_size: "3"
228
+ in_dim: "65"
229
+ mode: v2v
230
+ model_size: 1.3B
231
+ net_pred_type: flow
232
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
233
+ schedule_type: rf
234
+ total_num_frames: "21"
235
+ use_audio: "True"
236
+ net_optimizer:
237
+ _target_: <function get_optimizer at 0x7ff2f5916660>
238
+ betas:
239
+ - "0.0"
240
+ - "0.999"
241
+ eps: "1e-08"
242
+ fused: "False"
243
+ lr: "2e-06"
244
+ model: null
245
+ optim_type: adamw
246
+ weight_decay: "0.01"
247
+ net_scheduler:
248
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
249
+ cycle_lengths:
250
+ - "10000000000"
251
+ f_max:
252
+ - "1.0"
253
+ f_min:
254
+ - "1.0"
255
+ f_start:
256
+ - "1e-06"
257
+ warm_up_steps:
258
+ - "0"
259
+ precision: bfloat16
260
+ precision_amp: null
261
+ precision_amp_enc: null
262
+ precision_amp_infer: null
263
+ precision_fsdp: bfloat16
264
+ pretrained_model_path: ""
265
+ pretrained_student_net_path: ""
266
+ same_step_across_blocks: "True"
267
+ sample_t_cfg:
268
+ log_t_df: "0.01"
269
+ max_t: "0.999"
270
+ min_t: "0.001"
271
+ shift: "5.0"
272
+ t_list:
273
+ - "0.999"
274
+ - "0.937"
275
+ - "0.833"
276
+ - "0.624"
277
+ - "0.0"
278
+ time_dist_type: shifted
279
+ train_p_mean: "-1.1"
280
+ train_p_std: "2.0"
281
+ skip_layers: null
282
+ start_gradient_frame: "0"
283
+ student_sample_steps: "4"
284
+ student_sample_type: sde
285
+ student_update_freq: "5"
286
+ teacher:
287
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
288
+ audio_hidden_size: "32"
289
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
290
+ in_dim: "65"
291
+ merge_lora: "True"
292
+ mode: v2v
293
+ model_size: 14B
294
+ net_pred_type: flow
295
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
296
+ schedule_type: rf
297
+ use_audio: "True"
298
+ use_ema: "False"
299
+ model_class:
300
+ value:
301
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
302
+ config: null
303
+ trainer:
304
+ value:
305
+ augment_pipe: null
306
+ batch_size_global: null
307
+ callbacks:
308
+ ema:
309
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
310
+ beta: "0.9999"
311
+ ema_halflife_kimg: "500"
312
+ ema_rampup_ratio: "0.05"
313
+ gamma: "16.97"
314
+ start_iter: "0"
315
+ type: constant
316
+ gpu_stats:
317
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
318
+ every_n: "100"
319
+ grad_clip:
320
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
321
+ grad_norm: "10.0"
322
+ model_key: net
323
+ param_count:
324
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
325
+ train_profiler:
326
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
327
+ every_n: "100"
328
+ wandb:
329
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
330
+ fps: "25"
331
+ sample_logging_iter: "100"
332
+ checkpointer:
333
+ pretrained_ckpt_key_map:
334
+ net: net
335
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
336
+ s3_container: s3://checkpoints/fastgen
337
+ s3_credential: ./credentials/s3.json
338
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints
339
+ use_s3: "False"
340
+ cudnn:
341
+ benchmark: "True"
342
+ deterministic: "False"
343
+ ddp: "False"
344
+ fsdp: "True"
345
+ fsdp_cpu_offload: "False"
346
+ fsdp_min_num_params: "10000000"
347
+ fsdp_sharding_group_size: null
348
+ global_vars: null
349
+ global_vars_val:
350
+ - null
351
+ grad_accum_rounds: "2"
352
+ logging_iter: "1"
353
+ max_iter: "20"
354
+ offload_module_in_decoding: "False"
355
+ resume: "False"
356
+ save_ckpt_iter: "100"
357
+ seed: "0"
358
+ skip_initial_validation: "True"
359
+ tf32_enabled: "True"
360
+ val_seed: null
361
+ validation_iter: "100"
362
+ visualize_teacher: "False"
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/output.log ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Apr 2, 2026 - 22:45:32 | INFO | fastgen.callbacks.wandb:init_wandb:259 ] Wandb code upload disabled (set WANDB_UPLOAD_CODE=true to enable)
2
+ [Apr 2, 2026 - 22:45:32 | SUCCESS | fastgen.trainer:__init__:53 ] Callbacks initialized successfully
3
+ [Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:__init__:57 ] Callback synchronization complete
4
+ [Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:__init__:60 ] Initializing checkpointer...
5
+ [Apr 2, 2026 - 22:45:33 | SUCCESS | fastgen.trainer:__init__:65 ] Checkpointer initialized successfully
6
+ [Apr 2, 2026 - 22:45:33 | SUCCESS | __main__:main:33 ] Trainer initialized successfully
7
+ [Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:run:77 ] Starting training
8
+ [Apr 2, 2026 - 22:45:33 | INFO | fastgen.trainer:run:80 ] Initializing callbacks and model ...
9
+ [Apr 2, 2026 - 22:45:33 | INFO | fastgen.utils.checkpointer:load:151 ] Loading model from /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
10
+ [Apr 2, 2026 - 22:45:37 | INFO | fastgen.utils.checkpointer:load:154 ] Loading the model_dict...
11
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.checkpointer:load:159 ] Model net, loading info: <All keys matched successfully>
12
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:load_pretrained_ckpt:252 ] Loaded net model from net in /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth at iteration 5000
13
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:load_pretrained_ckpt:262 ] Setting resume_iter for model to 5000.
14
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:run:95 ] Starting model.on_train_begin ...
15
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.methods.model:on_train_begin:296 ] Teacher check: add_teacher_to_fsdp_dict=True, fsdp_dict keys=['net', 'fake_score', 'teacher'], teacher in fsdp_dict=True
16
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:run:99 ] model.on_train_begin completed
17
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.trainer:run:110 ] Wrapping model into fsdp ..
18
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:102 ] Fully sharding model with 4 ranks...
19
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'net' (1.42B params)...
20
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.networks.OmniAvatar.network_causal:fully_shard:1950 ] CausalOmniAvatarWan: keeping manual gradient checkpointing (not using apply_fsdp_checkpointing due to KV cache dynamics)
21
+ [Apr 2, 2026 - 22:45:49 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
22
+ [Apr 2, 2026 - 22:45:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped net in 1.2s
23
+ [Apr 2, 2026 - 22:45:50 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'fake_score' (1.60B params)...
24
+ [Apr 2, 2026 - 22:45:50 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
25
+ [Apr 2, 2026 - 22:45:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
26
+ [Apr 2, 2026 - 22:45:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped fake_score in 1.5s
27
+ [Apr 2, 2026 - 22:45:51 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:132 ] Starting FSDP2 wrap for 'teacher' (14.29B params)...
28
+ [Apr 2, 2026 - 22:45:51 | INFO | fastgen.networks.OmniAvatar.network:fully_shard:765 ] OmniAvatarWan: keeping manual gradient checkpointing (checkpoint_wrapper incompatible with inter-block audio injection)
29
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:177 ] Completed sharding
30
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.utils.distributed.fsdp:model_to_fsdp:214 ] FSDP2 wrapped teacher in 11.0s
31
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.trainer:run:118 ] FSDP wrapping completed
32
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.callbacks.ema:on_model_init_end:64 ] EMA ema is not enabled, skipping callback.
33
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.trainer:run:133 ] Auto-Resume Details: None
34
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.utils.basic_utils:set_random_seed:144 ] Using random seed 0.
35
+ [Apr 2, 2026 - 22:46:02 | INFO | fastgen.trainer:run:165 ] Instantiating dataloader...
36
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.wandb:on_dataloader_init_end:361 ] SKIP_GT_VAL_UPLOAD=1 — skipping GT val video upload
37
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.gpu_stats:on_train_begin:57 ] every_n to measure gpus stats: 1
38
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.train_profiler:on_train_begin:54 ] every_n to profile trainer: 1
39
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] model (OmniAvatarSelfForcingModel) has 1596.36 M trainable and 17311.83 M total params (logical).
40
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.42 M trainable and 4350.43 M total params LOCAL on rank 0.
41
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 1.
42
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 2.
43
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] model (OmniAvatarSelfForcingModel) has 420.14 M trainable and 4348.63 M total params LOCAL on rank 3.
44
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] net (CausalOmniAvatarWan) has 1421.38 M trainable and 1421.38 M total params (logical).
45
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.68 M trainable and 376.68 M total params LOCAL on rank 0.
46
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 1.
47
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 2.
48
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] net (CausalOmniAvatarWan) has 376.40 M trainable and 376.40 M total params LOCAL on rank 3.
49
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:79 ] fake_score (OmniAvatarWan) has 174.98 M trainable and 1596.36 M total params (logical).
50
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.30 M total params LOCAL on rank 0.
51
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 1.
52
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 2.
53
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.callbacks.param_count:on_train_begin:86 ] fake_score (OmniAvatarWan) has 43.75 M trainable and 399.02 M total params LOCAL on rank 3.
54
+ [Apr 2, 2026 - 22:46:08 | INFO | fastgen.trainer:run:174 ] iter_start: 0
55
+ [MEM] fake_score_update: START: alloc=9.45GB reserved=9.88GB peak=9.60GB
56
+ [MEM] fake_score_update: after student gen (no_grad): alloc=12.28GB reserved=49.39GB peak=45.74GB
57
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=13.16GB peak=45.74GB
58
+ [MEM-fwd] after block 0: alloc=13.74GB peak=45.74GB
59
+ [MEM-fwd] after block 10: alloc=19.55GB peak=45.74GB
60
+ [MEM-fwd] after block 20: alloc=24.84GB peak=45.74GB
61
+ [MEM-fwd] after block 29: alloc=29.59GB peak=45.74GB
62
+ [MEM-fwd] after head+unpatchify: alloc=30.67GB peak=45.74GB
63
+ [MEM] fake_score_update: START: alloc=13.25GB reserved=15.24GB peak=54.53GB
64
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.29GB reserved=47.38GB peak=46.75GB
65
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.17GB peak=46.75GB
66
+ [MEM-fwd] after block 0: alloc=14.75GB peak=46.75GB
67
+ [MEM-fwd] after block 10: alloc=20.56GB peak=46.75GB
68
+ [MEM-fwd] after block 20: alloc=25.85GB peak=46.75GB
69
+ [MEM-fwd] after block 29: alloc=30.61GB peak=46.75GB
70
+ [MEM-fwd] after head+unpatchify: alloc=31.69GB peak=46.75GB
71
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
72
+ Avg Max Min
73
+ cpu_mem_gb 38.492574 38.558720 38.459610
74
+ peak_gpu_mem_gb 51.700073 51.700073 51.700073
75
+ peak_gpu_mem_reserved_gb 53.640625 53.640625 53.640625
76
+ util 90.500000 97.000000 86.000000
77
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 1 : data loading time 0.81
78
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 1 : avg forward pass time 15.15
79
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 1 : backward pass time 11.36
80
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 1 : optimizer step time 1.28
81
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 1--------------------
82
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0068 iter count: 1.0
83
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0068 iter count: 1.0
84
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
85
+ [Apr 2, 2026 - 22:47:05 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
86
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=57.60GB peak=55.51GB
87
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.47GB peak=46.93GB
88
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
89
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
90
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
91
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
92
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
93
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
94
+ [MEM] fake_score_update: START: alloc=14.21GB reserved=58.37GB peak=55.69GB
95
+ [MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.25GB peak=47.71GB
96
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.71GB
97
+ [MEM-fwd] after block 0: alloc=15.71GB peak=47.71GB
98
+ [MEM-fwd] after block 10: alloc=21.52GB peak=47.71GB
99
+ [MEM-fwd] after block 20: alloc=26.81GB peak=47.71GB
100
+ [MEM-fwd] after block 29: alloc=31.57GB peak=47.71GB
101
+ [MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.71GB
102
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
103
+ Avg Max Min
104
+ cpu_mem_gb 38.666036 38.732494 38.632969
105
+ peak_gpu_mem_gb 52.593685 52.593685 52.593685
106
+ peak_gpu_mem_reserved_gb 54.365234 54.365234 54.365234
107
+ util 96.750000 100.000000 91.000000
108
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 2 : avg iteration time 51.61 seconds
109
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 2 : data loading time 0.00
110
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 2 : avg forward pass time 13.83
111
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 2 : backward pass time 11.39
112
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 2 : optimizer step time 1.16
113
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 2--------------------
114
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0070 iter count: 1.0
115
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0070 iter count: 1.0
116
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
117
+ [Apr 2, 2026 - 22:47:57 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
118
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=58.37GB peak=56.47GB
119
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
120
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
121
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
122
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
123
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
124
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
125
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
126
+ [MEM] fake_score_update: START: alloc=14.21GB reserved=58.40GB peak=55.69GB
127
+ [MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.29GB peak=47.72GB
128
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
129
+ [MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
130
+ [MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
131
+ [MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
132
+ [MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
133
+ [MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
134
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
135
+ Avg Max Min
136
+ cpu_mem_gb 38.666379 38.733353 38.633148
137
+ peak_gpu_mem_gb 52.593685 52.593685 52.593685
138
+ peak_gpu_mem_reserved_gb 54.404297 54.404297 54.404297
139
+ util 98.000000 100.000000 94.000000
140
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 3 : avg iteration time 57.83 seconds
141
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 3 : data loading time 0.00
142
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 3 : avg forward pass time 16.98
143
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 3 : backward pass time 11.32
144
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 3 : optimizer step time 1.16
145
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 3--------------------
146
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0072 iter count: 1.0
147
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0072 iter count: 1.0
148
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
149
+ [Apr 2, 2026 - 22:48:55 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
150
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=58.42GB peak=56.47GB
151
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.52GB peak=46.93GB
152
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
153
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
154
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
155
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
156
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
157
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
158
+ [MEM] fake_score_update: START: alloc=14.21GB reserved=58.42GB peak=55.69GB
159
+ [MEM] fake_score_update: after student gen (no_grad): alloc=14.25GB reserved=48.27GB peak=47.72GB
160
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=15.13GB peak=47.72GB
161
+ [MEM-fwd] after block 0: alloc=15.71GB peak=47.72GB
162
+ [MEM-fwd] after block 10: alloc=21.52GB peak=47.72GB
163
+ [MEM-fwd] after block 20: alloc=26.81GB peak=47.72GB
164
+ [MEM-fwd] after block 29: alloc=31.57GB peak=47.72GB
165
+ [MEM-fwd] after head+unpatchify: alloc=32.65GB peak=47.72GB
166
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.gpu_stats:log_prof_data:45 ] GPU stats:
167
+ Avg Max Min
168
+ cpu_mem_gb 38.666442 38.733406 38.633190
169
+ peak_gpu_mem_gb 52.593685 52.593685 52.593685
170
+ peak_gpu_mem_reserved_gb 54.443359 54.443359 54.443359
171
+ util 96.500000 99.000000 92.000000
172
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:110 ] 4 : avg iteration time 57.83 seconds
173
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:123 ] 4 : data loading time 0.00
174
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:124 ] 4 : avg forward pass time 16.97
175
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:125 ] 4 : backward pass time 11.34
176
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.train_profiler:on_training_step_end:126 ] 4 : optimizer step time 1.19
177
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:log_stats:569 ] logging train stats at iteration 4--------------------
178
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_total_loss: 0.0073 iter count: 1.0
179
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_fake_score_loss: 0.0073 iter count: 1.0
180
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:get_stat:303 ] avg_gan_loss_disc: 0.0000 iter count: 1.0
181
+ [Apr 2, 2026 - 22:49:53 | INFO | fastgen.callbacks.wandb:on_training_step_end:600 ] WandB logging complete after 0.00 seconds
182
+ [MEM] fake_score_update: START: alloc=13.43GB reserved=58.46GB peak=56.47GB
183
+ [MEM] fake_score_update: after student gen (no_grad): alloc=13.47GB reserved=47.49GB peak=46.93GB
184
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=14.35GB peak=46.93GB
185
+ [MEM-fwd] after block 0: alloc=14.93GB peak=46.93GB
186
+ [MEM-fwd] after block 10: alloc=20.74GB peak=46.93GB
187
+ [MEM-fwd] after block 20: alloc=26.02GB peak=46.93GB
188
+ [MEM-fwd] after block 29: alloc=30.78GB peak=46.93GB
189
+ [MEM-fwd] after head+unpatchify: alloc=31.86GB peak=46.93GB
190
+ [MEM] student_update: START: alloc=14.32GB reserved=58.46GB peak=55.75GB
191
+ [MEM] student_update: after rollout: alloc=63.85GB reserved=66.24GB peak=65.62GB
192
+ [MEM] student_update: after perturb: alloc=63.87GB reserved=66.24GB peak=65.62GB
193
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=64.62GB peak=65.62GB
194
+ [MEM-fwd] after block 0: alloc=64.67GB peak=75.08GB
195
+ [MEM-fwd] after block 10: alloc=65.20GB peak=75.61GB
196
+ [MEM-fwd] after block 20: alloc=65.20GB peak=75.61GB
197
+ [MEM-fwd] after block 29: alloc=65.20GB peak=75.61GB
198
+ [MEM-fwd] after head+unpatchify: alloc=64.69GB peak=75.61GB
199
+ [MEM] student_update: after fake_score: alloc=63.95GB reserved=78.02GB peak=75.61GB
200
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.83GB peak=75.61GB
201
+ [MEM-fwd] after block 0: alloc=67.05GB peak=88.90GB
202
+ [MEM-fwd] after block 10: alloc=68.82GB peak=90.66GB
203
+ [MEM-fwd] after block 20: alloc=68.82GB peak=90.66GB
204
+ [MEM-fwd] after block 30: alloc=68.82GB peak=90.66GB
205
+ [MEM-fwd] after block 39: alloc=68.82GB peak=90.66GB
206
+ [MEM-fwd] after head+unpatchify: alloc=67.08GB peak=90.66GB
207
+ [MEM] student_update: after teacher: alloc=64.45GB reserved=93.79GB peak=90.66GB
208
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=66.37GB peak=90.66GB
209
+ [MEM-fwd] after block 0: alloc=67.08GB peak=90.66GB
210
+ [MEM-fwd] after block 10: alloc=68.84GB peak=90.68GB
211
+ [MEM-fwd] after block 20: alloc=68.84GB peak=90.68GB
212
+ [MEM-fwd] after block 30: alloc=68.84GB peak=90.68GB
213
+ [MEM-fwd] after block 39: alloc=68.84GB peak=90.68GB
214
+ [MEM-fwd] after head+unpatchify: alloc=67.10GB peak=90.68GB
215
+ [MEM] student_update: after CFG: alloc=64.45GB reserved=93.83GB peak=90.68GB
216
+ [MEM] student_update: after VSD loss: alloc=64.47GB reserved=93.83GB peak=90.68GB
217
+ [MEM] fake_score_update: START: alloc=12.68GB reserved=72.00GB peak=90.68GB
218
+ [MEM] fake_score_update: after student gen (no_grad): alloc=15.51GB reserved=52.62GB peak=48.98GB
219
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=16.28GB peak=48.98GB
220
+ [MEM-fwd] after block 0: alloc=16.91GB peak=48.98GB
221
+ [MEM-fwd] after block 10: alloc=22.73GB peak=48.98GB
222
+ [MEM-fwd] after block 20: alloc=28.01GB peak=48.98GB
223
+ [MEM-fwd] after block 29: alloc=32.77GB peak=48.98GB
224
+ [MEM-fwd] after head+unpatchify: alloc=33.85GB peak=48.98GB
225
+ [MEM] student_update: START: alloc=15.52GB reserved=61.80GB peak=56.95GB
226
+ [MEM] student_update: after rollout: alloc=65.05GB reserved=68.04GB peak=66.83GB
227
+ [MEM] student_update: after perturb: alloc=65.07GB reserved=68.04GB peak=66.83GB
228
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 1536]), 30 blocks): alloc=65.82GB peak=66.83GB
229
+ [MEM-fwd] after block 0: alloc=65.87GB peak=76.28GB
230
+ [MEM-fwd] after block 10: alloc=66.40GB peak=76.81GB
231
+ [MEM-fwd] after block 20: alloc=66.40GB peak=76.81GB
232
+ [MEM-fwd] after block 29: alloc=66.40GB peak=76.81GB
233
+ [MEM-fwd] after head+unpatchify: alloc=65.90GB peak=76.81GB
234
+ [MEM] student_update: after fake_score: alloc=65.15GB reserved=79.81GB peak=76.81GB
235
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.08GB peak=76.81GB
236
+ [MEM-fwd] after block 0: alloc=67.78GB peak=89.62GB
237
+ [MEM-fwd] after block 10: alloc=69.54GB peak=91.39GB
238
+ [MEM-fwd] after block 20: alloc=69.54GB peak=91.39GB
239
+ [MEM-fwd] after block 30: alloc=69.54GB peak=91.39GB
240
+ [MEM-fwd] after block 39: alloc=69.54GB peak=91.39GB
241
+ [MEM-fwd] after head+unpatchify: alloc=67.80GB peak=91.39GB
242
+ [MEM] student_update: after teacher: alloc=65.17GB reserved=94.89GB peak=91.39GB
243
+ [MEM-fwd] layer_loop_start (dim=torch.Size([8, 21504, 5120]), 40 blocks): alloc=67.10GB peak=91.39GB
244
+ [MEM-fwd] after block 0: alloc=67.80GB peak=91.39GB
245
+ [MEM-fwd] after block 10: alloc=69.57GB peak=91.41GB
246
+ [MEM-fwd] after block 20: alloc=69.57GB peak=91.41GB
247
+ [MEM-fwd] after block 30: alloc=69.57GB peak=91.41GB
248
+ [MEM-fwd] after block 39: alloc=69.57GB peak=91.41GB
249
+ [MEM-fwd] after head+unpatchify: alloc=67.83GB peak=91.41GB
250
+ [MEM] student_update: after CFG: alloc=65.17GB reserved=94.91GB peak=91.41GB
251
+ [MEM] student_update: after VSD loss: alloc=65.20GB reserved=94.91GB peak=91.41GB
252
+ Traceback (most recent call last):
253
+ File "/home/work/.local/hyunbin/FastGen/train.py", line 46, in <module>
254
+ main(config)
255
+ File "/home/work/.local/hyunbin/FastGen/train.py", line 37, in main
256
+ fastgen_trainer.run(model)
257
+ File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 194, in run
258
+ loss_map, outputs = self.train_step(model_ddp, model, data, iter_cur, grad_accum_iter)
259
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
260
+ File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 331, in train_step
261
+ model.grad_scaler.scale(loss_map["total_loss"] / grad_accum_rounds).backward()
262
+ File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/_tensor.py", line 630, in backward
263
+ torch.autograd.backward(
264
+ File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/__init__.py", line 364, in backward
265
+ _engine_run_backward(
266
+ File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/graph.py", line 865, in _engine_run_backward
267
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
268
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
269
+ File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 1177, in unpack_hook
270
+ frame.check_recomputed_tensors_match(gid)
271
+ File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 882, in check_recomputed_tensors_match
272
+ raise CheckpointError(
273
+ torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: A different number of tensors was saved during the original forward and recomputation.
274
+ Number of tensors saved during forward: 94
275
+ Number of tensors saved during recomputation: 80.
276
+
277
+ Tip: To see a more detailed error message, either pass `debug=True` to
278
+ `torch.utils.checkpoint.checkpoint(...)` or wrap the code block
279
+ with `with torch.utils.checkpoint.set_checkpoint_debug_enabled(True):` to
280
+ enable checkpoint‑debug mode globally.
281
+
282
+ [rank0]: Traceback (most recent call last):
283
+ [rank0]: File "/home/work/.local/hyunbin/FastGen/train.py", line 46, in <module>
284
+ [rank0]: main(config)
285
+ [rank0]: File "/home/work/.local/hyunbin/FastGen/train.py", line 37, in main
286
+ [rank0]: fastgen_trainer.run(model)
287
+ [rank0]: File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 194, in run
288
+ [rank0]: loss_map, outputs = self.train_step(model_ddp, model, data, iter_cur, grad_accum_iter)
289
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
290
+ [rank0]: File "/home/work/.local/hyunbin/FastGen/fastgen/trainer.py", line 331, in train_step
291
+ [rank0]: model.grad_scaler.scale(loss_map["total_loss"] / grad_accum_rounds).backward()
292
+ [rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/_tensor.py", line 630, in backward
293
+ [rank0]: torch.autograd.backward(
294
+ [rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/__init__.py", line 364, in backward
295
+ [rank0]: _engine_run_backward(
296
+ [rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/autograd/graph.py", line 865, in _engine_run_backward
297
+ [rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
298
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
299
+ [rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 1177, in unpack_hook
300
+ [rank0]: frame.check_recomputed_tensors_match(gid)
301
+ [rank0]: File "/home/work/.local/miniconda3/envs/hb_fastgen/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 882, in check_recomputed_tensors_match
302
+ [rank0]: raise CheckpointError(
303
+ [rank0]: torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: A different number of tensors was saved during the original forward and recomputation.
304
+ [rank0]: Number of tensors saved during forward: 94
305
+ [rank0]: Number of tensors saved during recomputation: 80.
306
+
307
+ [rank0]: Tip: To see a more detailed error message, either pass `debug=True` to
308
+ [rank0]: `torch.utils.checkpoint.checkpoint(...)` or wrap the code block
309
+ [rank0]: with `with torch.utils.checkpoint.set_checkpoint_debug_enabled(True):` to
310
+ [rank0]: enable checkpoint‑debug mode globally.
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/requirements.txt ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastgen==0.1.0
2
+ nvitop==1.6.1
3
+ ftfy==6.3.1
4
+ braceexpand==0.1.7
5
+ antlr4-python3-runtime==4.9.3
6
+ webdataset==1.0.2
7
+ sentry-sdk==2.53.0
8
+ rdkit==2025.9.5
9
+ python-dotenv==1.2.1
10
+ proglog==0.1.12
11
+ omegaconf==2.3.0
12
+ narwhals==2.17.0
13
+ loguru==0.7.3
14
+ imageio-ffmpeg==0.6.0
15
+ plotly==6.5.2
16
+ moviepy==2.2.1
17
+ hydra-core==1.3.2
18
+ wandb==0.25.0
19
+ fastgen==0.1.0
20
+ packaging==25.0
21
+ setuptools==80.10.2
22
+ wheel==0.46.3
23
+ pip==26.0.1
24
+ webencodings==0.5.1
25
+ pure_eval==0.2.3
26
+ ptyprocess==0.7.0
27
+ nvidia-ml-py==13.590.48
28
+ nvidia-cusparselt-cu12==0.7.1
29
+ mpmath==1.3.0
30
+ fastjsonschema==2.21.2
31
+ zipp==3.23.0
32
+ xyzservices==2025.11.0
33
+ widgetsnbextension==4.0.15
34
+ websocket-client==1.9.0
35
+ webcolors==25.10.0
36
+ wcwidth==0.6.0
37
+ urllib3==2.6.3
38
+ uri-template==1.3.0
39
+ tzdata==2025.3
40
+ typing_extensions==4.15.0
41
+ triton==3.6.0
42
+ traitlets==5.14.3
43
+ tqdm==4.67.3
44
+ tornado==6.5.5
45
+ tinycss2==1.4.0
46
+ sympy==1.14.0
47
+ soupsieve==2.8.3
48
+ smmap==5.0.3
49
+ six==1.16.0
50
+ sentencepiece==0.2.1
51
+ Send2Trash==2.1.0
52
+ safetensors==0.7.0
53
+ rpds-py==0.30.0
54
+ rfc3986-validator==0.1.1
55
+ regex==2026.2.28
56
+ pyzmq==27.1.0
57
+ PyYAML==6.0.3
58
+ python-json-logger==4.0.0
59
+ Pygments==2.19.2
60
+ pycparser==3.0
61
+ psutil==7.2.2
62
+ protobuf==4.24.4
63
+ prometheus_client==0.24.1
64
+ platformdirs==4.9.4
65
+ pillow==11.3.0
66
+ pexpect==4.9.0
67
+ parso==0.8.6
68
+ pandocfilters==1.5.1
69
+ nvidia-nvtx-cu12==12.8.90
70
+ nvidia-nvshmem-cu12==3.4.5
71
+ nvidia-nvjitlink-cu12==12.8.93
72
+ nvidia-nccl-cu12==2.27.5
73
+ nvidia-curand-cu12==10.3.9.90
74
+ nvidia-cufile-cu12==1.13.1.3
75
+ nvidia-cuda-runtime-cu12==12.8.90
76
+ nvidia-cuda-nvrtc-cu12==12.8.93
77
+ nvidia-cuda-cupti-cu12==12.8.90
78
+ nvidia-cublas-cu12==12.8.4.1
79
+ numpy==1.26.4
80
+ networkx==3.6.1
81
+ nest-asyncio==1.6.0
82
+ mistune==3.2.0
83
+ MarkupSafe==3.0.3
84
+ lark==1.3.1
85
+ jupyterlab_widgets==3.0.16
86
+ jupyterlab_pygments==0.3.0
87
+ jsonpointer==3.0.0
88
+ json5==0.13.0
89
+ jmespath==1.1.0
90
+ idna==3.11
91
+ hf-xet==1.4.2
92
+ h11==0.16.0
93
+ fsspec==2026.2.0
94
+ fqdn==1.5.1
95
+ filelock==3.25.2
96
+ executing==2.2.1
97
+ einops==0.8.2
98
+ defusedxml==0.7.1
99
+ decorator==5.2.1
100
+ debugpy==1.8.20
101
+ cuda-pathfinder==1.4.2
102
+ comm==0.2.3
103
+ click==8.3.1
104
+ charset-normalizer==3.4.5
105
+ certifi==2026.2.25
106
+ bleach==6.3.0
107
+ babel==2.18.0
108
+ av==17.0.0
109
+ attrs==25.4.0
110
+ async-lru==2.2.0
111
+ asttokens==3.0.1
112
+ annotated-types==0.7.0
113
+ typing-inspection==0.4.2
114
+ terminado==0.18.1
115
+ stack-data==0.6.3
116
+ scipy==1.17.1
117
+ rfc3987-syntax==1.1.0
118
+ rfc3339-validator==0.1.4
119
+ requests==2.32.5
120
+ referencing==0.37.0
121
+ python-dateutil==2.9.0.post0
122
+ pydantic_core==2.41.5
123
+ prompt_toolkit==3.0.52
124
+ opencv-python-headless==4.11.0.86
125
+ nvidia-cusparse-cu12==12.5.8.93
126
+ nvidia-cufft-cu12==11.3.3.83
127
+ nvidia-cudnn-cu12==9.10.2.21
128
+ matplotlib-inline==0.2.1
129
+ jupyter_core==5.9.1
130
+ Jinja2==3.1.6
131
+ jedi==0.19.2
132
+ ipython_pygments_lexers==1.1.1
133
+ importlib_metadata==8.7.1
134
+ ImageIO==2.37.3
135
+ httpcore==1.0.9
136
+ gitdb==4.0.12
137
+ cuda-bindings==12.9.4
138
+ contourpy==1.3.3
139
+ cffi==2.0.0
140
+ beautifulsoup4==4.14.3
141
+ anyio==4.12.1
142
+ soundfile==0.13.1
143
+ pydantic==2.12.5
144
+ nvidia-cusolver-cu12==11.7.3.90
145
+ jupyter_server_terminals==0.5.4
146
+ jupyter_client==8.8.0
147
+ jsonschema-specifications==2025.9.1
148
+ ipython==9.11.0
149
+ httpx==0.28.1
150
+ GitPython==3.1.46
151
+ botocore==1.42.68
152
+ bokeh==3.9.0
153
+ arrow==1.4.0
154
+ argon2-cffi-bindings==25.1.0
155
+ torch==2.10.0
156
+ s3transfer==0.16.0
157
+ jsonschema==4.26.0
158
+ isoduration==20.11.0
159
+ ipywidgets==8.1.8
160
+ ipykernel==7.2.0
161
+ argon2-cffi==25.1.0
162
+ torchvision==0.25.0
163
+ nbformat==5.10.4
164
+ jupyter-console==6.6.3
165
+ boto3==1.42.68
166
+ accelerate==1.13.0
167
+ nbclient==0.10.4
168
+ jupyter-events==0.12.0
169
+ nbconvert==7.17.0
170
+ jupyter_server==2.17.0
171
+ notebook_shim==0.2.4
172
+ jupyterlab_server==2.28.0
173
+ jupyter-lsp==2.3.0
174
+ jupyterlab==4.5.6
175
+ notebook==7.5.5
176
+ jupyter==1.1.1
177
+ fastgen==0.1.0
178
+ pandas==3.0.1
179
+ shellingham==1.5.4
180
+ mdurl==0.1.2
181
+ annotated-doc==0.0.4
182
+ markdown-it-py==4.0.0
183
+ rich==14.3.3
184
+ typer==0.24.1
185
+ huggingface_hub==1.7.1
186
+ timm==1.0.25
187
+ tokenizers==0.22.2
188
+ diffusers==0.37.0
189
+ transformers==5.3.0
190
+ peft==0.18.1
191
+ easydict==1.13
192
+ lmdb==2.2.0
193
+ threadpoolctl==3.6.0
194
+ soxr==1.0.0
195
+ msgpack==1.1.2
196
+ llvmlite==0.47.0
197
+ lazy-loader==0.5
198
+ joblib==1.5.3
199
+ audioread==3.1.0
200
+ scikit-learn==1.8.0
201
+ pooch==1.9.0
202
+ numba==0.65.0
203
+ librosa==0.11.0
204
+ simsimd==6.5.16
205
+ flatbuffers==25.12.19
206
+ tifffile==2026.3.3
207
+ stringzilla==4.6.0
208
+ pyparsing==3.3.2
209
+ prettytable==3.17.0
210
+ onnx==1.17.0
211
+ kiwisolver==1.5.0
212
+ fonttools==4.62.1
213
+ Cython==3.2.4
214
+ cycler==0.12.1
215
+ scikit-image==0.26.0
216
+ onnxruntime==1.24.4
217
+ matplotlib==3.10.8
218
+ albucore==0.0.24
219
+ albumentations==2.0.8
220
+ insightface==0.7.3
221
+ kornia_rs==0.1.10
222
+ kornia==0.8.2
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-metadata.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-151-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.12",
4
+ "startedAt": "2026-04-02T13:45:31.145420Z",
5
+ "args": [
6
+ "--config=fastgen/configs/experiments/OmniAvatar/config_sf.py",
7
+ "-",
8
+ "trainer.resume=False",
9
+ "log_config.name=sf_combined_step_test",
10
+ "log_config.project=OmniAvatar-FastGen",
11
+ "trainer.max_iter=20"
12
+ ],
13
+ "program": "/home/work/.local/hyunbin/FastGen/train.py",
14
+ "codePath": "train.py",
15
+ "codePathLocal": "train.py",
16
+ "git": {
17
+ "remote": "https://paulcho98:@github.com/paulcho98/FastGen.git",
18
+ "commit": "04de80beaf50f849c12a55a5d8358d94530b7bb5"
19
+ },
20
+ "email": "paul.hyunbin@gmail.com",
21
+ "root": "/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test",
22
+ "host": "main1",
23
+ "executable": "/home/work/.local/miniconda3/envs/hb_fastgen/bin/python3.12",
24
+ "cpu_count": 112,
25
+ "cpu_count_logical": 224,
26
+ "gpu": "NVIDIA H200",
27
+ "gpu_count": 4,
28
+ "disk": {
29
+ "/": {
30
+ "total": "1356758433792",
31
+ "used": "257963536384"
32
+ }
33
+ },
34
+ "memory": {
35
+ "total": "2163961778176"
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA H200",
40
+ "memoryTotal": "150754820096",
41
+ "cudaCores": 16896,
42
+ "architecture": "Hopper",
43
+ "uuid": "GPU-4685d4b3-5cf9-2766-43d3-b9615a684b7c"
44
+ },
45
+ {
46
+ "name": "NVIDIA H200",
47
+ "memoryTotal": "150754820096",
48
+ "cudaCores": 16896,
49
+ "architecture": "Hopper",
50
+ "uuid": "GPU-ec888a66-4b6f-b8de-b34b-249efb9ad262"
51
+ },
52
+ {
53
+ "name": "NVIDIA H200",
54
+ "memoryTotal": "150754820096",
55
+ "cudaCores": 16896,
56
+ "architecture": "Hopper",
57
+ "uuid": "GPU-9c1e1773-d710-06c9-7db7-1b54e9fc3790"
58
+ },
59
+ {
60
+ "name": "NVIDIA H200",
61
+ "memoryTotal": "150754820096",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper",
64
+ "uuid": "GPU-2b1017dc-2958-a946-16d2-2c29da6d18b0"
65
+ }
66
+ ],
67
+ "cudaVersion": "12.9",
68
+ "writerId": "mf7qlulwtbceq6fkw52thj4sgf25e3dz"
69
+ }
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"fake_score/local_total_params":[399300904,399021352,399021352,399021352],"optimizer/non_finite_grads_count (model_key net)":0,"_wandb":{"runtime":445},"optimizer/grad_norm (model_key net)":0,"_runtime":445.189429909,"fake_score/total_params":1596364960,"profiler/avg_forward_pass_time":16.967717550694942,"model/trainable_params":1596364960,"model/total_params":17311830496,"train/fake_score_loss":0.0072784423828125,"_timestamp":1.7751377931214402e+09,"profiler/avg_iteration_time":57.83046340942383,"profiler/data_loading_time":0.000791529193520546,"fake_score/local_trainable_params":[43745280,43745280,43745280,43745280],"profiler/backward_pass_time":11.342406308278441,"net/local_total_params":[376675360,376398880,376398880,376398880],"train/total_loss":0.0072784423828125,"model/local_total_params":[4350425968,4348630896,4348630896,4348630896],"_step":4,"optimizer/lr_fake_score":2e-06,"net/trainable_params":1421383840,"train/gan_loss_disc":0,"profiler/optimizer_step_time":1.187411269173026,"net/total_params":1421383840,"optimizer/lr_net":2e-06,"model/local_trainable_params":[420420640,420144160,420144160,420144160],"fake_score/trainable_params":174981120,"net/local_trainable_params":[376675360,376398880,376398880,376398880],"optimizer/iteration":4}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-core.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-02T22:45:31.195756192+09:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmphkxult0z/port-792541.txt","pid":792541,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-02T22:45:31.196239435+09:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":792541}
3
+ {"time":"2026-04-02T22:45:31.196228389+09:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-792541-806747-3978962853/socket","Net":"unix"}}
4
+ {"time":"2026-04-02T22:45:31.38167949+09:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-02T22:45:31.396742512+09:00","level":"INFO","msg":"handleInformInit: received","streamId":"spcd04xe","id":"1(@)"}
6
+ {"time":"2026-04-02T22:45:31.746828856+09:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"spcd04xe","id":"1(@)"}
7
+ {"time":"2026-04-02T22:45:37.997230573+09:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"ve1uhd81uimn"}
8
+ {"time":"2026-04-02T22:52:57.910992536+09:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
9
+ {"time":"2026-04-02T22:52:57.911042628+09:00","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2026-04-02T22:52:57.911035318+09:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
11
+ {"time":"2026-04-02T22:52:57.911115888+09:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
12
+ {"time":"2026-04-02T22:52:57.911110682+09:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-792541-806747-3978962853/socket","Net":"unix"}}
13
+ {"time":"2026-04-02T22:52:58.870224925+09:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
14
+ {"time":"2026-04-02T22:52:58.87024994+09:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2026-04-02T22:52:58.870261844+09:00","level":"INFO","msg":"server is closed"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-02T22:45:31.396833745+09:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-04-02T22:45:31.746696076+09:00","level":"INFO","msg":"stream: created new stream","id":"spcd04xe"}
3
+ {"time":"2026-04-02T22:45:31.746744779+09:00","level":"INFO","msg":"handler: started","stream_id":"spcd04xe"}
4
+ {"time":"2026-04-02T22:45:31.746822827+09:00","level":"INFO","msg":"stream: started","id":"spcd04xe"}
5
+ {"time":"2026-04-02T22:45:31.746841154+09:00","level":"INFO","msg":"sender: started","stream_id":"spcd04xe"}
6
+ {"time":"2026-04-02T22:45:31.74684523+09:00","level":"INFO","msg":"writer: started","stream_id":"spcd04xe"}
7
+ {"time":"2026-04-02T22:52:57.91103952+09:00","level":"INFO","msg":"stream: closing","id":"spcd04xe"}
8
+ {"time":"2026-04-02T22:52:58.486184439+09:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-04-02T22:52:58.869727008+09:00","level":"INFO","msg":"handler: closed","stream_id":"spcd04xe"}
10
+ {"time":"2026-04-02T22:52:58.869816672+09:00","level":"INFO","msg":"sender: closed","stream_id":"spcd04xe"}
11
+ {"time":"2026-04-02T22:52:58.869827326+09:00","level":"INFO","msg":"stream: closed","id":"spcd04xe"}
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Configure stats pid to 792541
3
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug.log
5
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/logs/debug-internal.log
6
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'log_config': {'project': 'OmniAvatar-FastGen', 'group': 'omniavatar_sf', 'name': 'sf_combined_step_test', 'wandb_mode': 'online', 'wandb_entity': 'paulhcho', 'wandb_credential': './credentials/wandb_api.txt'}, 'trainer': {'cudnn': {'deterministic': 'False', 'benchmark': 'True'}, 'checkpointer': {'save_dir': '/tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/checkpoints', 'use_s3': 'False', 's3_container': 's3://checkpoints/fastgen', 's3_credential': './credentials/s3.json', 'pretrained_ckpt_path': '/home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth', 'pretrained_ckpt_key_map': {'net': 'net'}}, 'callbacks': {'grad_clip': {'grad_norm': '10.0', 'model_key': 'net', '_target_': "<class 'fastgen.callbacks.grad_clip.GradClipCallback'>"}, 'gpu_stats': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>"}, 'train_profiler': {'every_n': '100', '_target_': "<class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>"}, 'param_count': {'_target_': "<class 'fastgen.callbacks.param_count.ParamCountCallback'>"}, 'ema': {'type': 'constant', 'beta': '0.9999', 'gamma': '16.97', 'ema_halflife_kimg': '500', 'ema_rampup_ratio': '0.05', 'start_iter': '0', '_target_': "<class 'fastgen.callbacks.ema.EMACallback'>"}, 'wandb': {'sample_logging_iter': '100', '_target_': "<class 'fastgen.callbacks.wandb.WandbCallback'>", 'fps': '25'}}, 'save_ckpt_iter': '100', 'validation_iter': '100', 'skip_initial_validation': 'True', 'logging_iter': '1', 'max_iter': '20', 'visualize_teacher': 'False', 'seed': '0', 'val_seed': None, 'resume': 'False', 'ddp': 'False', 'fsdp': 'True', 'tf32_enabled': 'True', 'grad_accum_rounds': '2', 'batch_size_global': None, 'offload_module_in_decoding': 'False', 'fsdp_cpu_offload': 'False', 'fsdp_min_num_params': '10000000', 'fsdp_sharding_group_size': None, 'global_vars': None, 'global_vars_val': [None], 'augment_pipe': None}, 'dataloader_train': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_path.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '8', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', '_target_': "<class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>"}, 'dataloader_val': {'data_list_path': '/home/work/stableavatar_data/v2v_training_data/video_square_val10.txt', 'latentsync_mask_path': '/home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png', 'batch_size': '1', 'num_workers': '2', 'neg_text_emb_path': '/home/work/stableavatar_data/neg_text_emb.pt', 'use_ref_sequence': 'True', 'load_ode_path': 'False', '_target_': '<function create_omniavatar_dataloader at 0x7ff2f56a3ce0>'}, 'eval': {'num_samples': '50000', 'save_images': 'False', 'min_ckpt': '0', 'max_ckpt': '100000000', 'samples_dir': 'samples'}, 'model': {'net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'chunk_size': '3', 'total_num_frames': '21', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>"}, 'teacher': {'model_size': '14B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_phase2/step-10500.pt', 'merge_lora': 'True', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'fake_score_net': {'model_size': '1.3B', 'in_dim': '65', 'mode': 'v2v', 'use_audio': 'True', 'audio_hidden_size': '32', 'base_model_paths': '/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors', 'omniavatar_ckpt_path': '/home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt', 'merge_lora': 'False', 'net_pred_type': 'flow', 'schedule_type': 'rf', '_target_': "<class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>"}, 'guidance_scale': '4.5', 'skip_layers': None, 'net_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'net_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'sample_t_cfg': {'time_dist_type': 'shifted', 'train_p_mean': '-1.1', 'train_p_std': '2.0', 'shift': '5.0', 'min_t': '0.001', 'max_t': '0.999', 't_list': ['0.999', '0.937', '0.833', '0.624', '0.0'], 'log_t_df': '0.01'}, 'input_shape': ['16', '21', '64', '64'], 'device': 'cuda', 'grad_scaler_enabled': 'False', 'grad_scaler_init_scale': '65536.0', 'grad_scaler_growth_interval': '2000', 'pretrained_model_path': '', 'pretrained_student_net_path': '', 'load_student_weights': 'False', 'enable_preprocessors': 'True', 'use_ema': 'False', 'student_sample_steps': '4', 'student_sample_type': 'sde', 'fsdp_meta_init': 'False', 'add_teacher_to_fsdp_dict': 'True', 'ddp_find_unused_parameters': 'True', 'precision': 'bfloat16', 'precision_amp': None, 'precision_amp_infer': None, 'precision_amp_enc': None, 'precision_fsdp': 'bfloat16', 'fake_score_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '2e-06', 'weight_decay': '0.01', 'betas': ['0.0', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'fake_score_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'discriminator': {'feature_indices': '{0, 1, 2}', 'all_res': ['32', '16', '8'], 'in_channels': '256', '_target_': "<class 'fastgen.networks.discriminators.Discriminator_EDM'>"}, 'discriminator_optimizer': {'model': None, 'optim_type': 'adamw', 'lr': '0.0001', 'weight_decay': '0.01', 'betas': ['0.9', '0.999'], 'eps': '1e-08', 'fused': 'False', '_target_': '<function get_optimizer at 0x7ff2f5916660>'}, 'discriminator_scheduler': {'warm_up_steps': ['0'], 'cycle_lengths': ['10000000000'], 'f_start': ['1e-06'], 'f_max': ['1.0'], 'f_min': ['1.0'], '_target_': "<class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>"}, 'student_update_freq': '5', 'gan_loss_weight_gen': '0', 'gan_use_same_t_noise': 'False', 'fake_score_pred_type': 'x0', 'gan_r1_reg_weight': '0.0', 'gan_r1_reg_alpha': '0.1', 'enable_gradient_in_rollout': 'True', 'start_gradient_frame': '0', 'same_step_across_blocks': 'True', 'last_step_only': 'False', 'context_noise': '0.0', 'fake_score': None}, 'model_class': {'config': None, '_target_': "<class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>"}, '_wandb': {}}
9
+ 2026-04-02 22:45:31,146 INFO MainThread:792541 [wandb_init.py:init():892] starting backend
10
+ 2026-04-02 22:45:31,381 INFO MainThread:792541 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-02 22:45:31,394 INFO MainThread:792541 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-02 22:45:31,398 INFO MainThread:792541 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-02 22:45:31,413 INFO MainThread:792541 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-02 22:45:32,719 INFO MainThread:792541 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-04-02 22:45:32,993 INFO MainThread:792541 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-04-02 22:45:32,994 INFO MainThread:792541 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-04-02 22:45:32,998 INFO MainThread:792541 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [service_client.py:_forward_responses():134] Reached EOF.
21
+ 2026-04-02 22:52:57,911 INFO wandb-AsyncioManager-main:792541 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb/run-20260402_224531-spcd04xe/run-spcd04xe.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c85a6826c3e4463f05f583260010422503fd8df994d4cb8222a7140a5f63297a
3
+ size 132588
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_combined_step_test/wandb_id.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ spcd04xe
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_no_reqgrad_toggle/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_train:
2
+ _target_: <class 'fastgen.datasets.omniavatar_dataloader.OmniAvatarDataLoader'>
3
+ batch_size: '8'
4
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_path.txt
5
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
6
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
7
+ num_workers: '2'
8
+ use_ref_sequence: 'True'
9
+ dataloader_val:
10
+ _target_: <function create_omniavatar_dataloader at 0x7feb4c60be20>
11
+ batch_size: '1'
12
+ data_list_path: /home/work/stableavatar_data/v2v_training_data/video_square_val10.txt
13
+ latentsync_mask_path: /home/work/.local/Self-Forcing_LipSync_StableAvatar/diffsynth/utils/mask.png
14
+ load_ode_path: 'False'
15
+ neg_text_emb_path: /home/work/stableavatar_data/neg_text_emb.pt
16
+ num_workers: '2'
17
+ use_ref_sequence: 'True'
18
+ eval:
19
+ max_ckpt: '100000000'
20
+ min_ckpt: '0'
21
+ num_samples: '50000'
22
+ samples_dir: samples
23
+ save_images: 'False'
24
+ log_config:
25
+ group: omniavatar_sf
26
+ name: sf_no_reqgrad_toggle
27
+ project: OmniAvatar-FastGen
28
+ wandb_credential: ./credentials/wandb_api.txt
29
+ wandb_entity: paulhcho
30
+ wandb_mode: disabled
31
+ model:
32
+ add_teacher_to_fsdp_dict: 'True'
33
+ context_noise: '0.0'
34
+ ddp_find_unused_parameters: 'True'
35
+ device: cuda
36
+ discriminator:
37
+ _target_: <class 'fastgen.networks.discriminators.Discriminator_EDM'>
38
+ all_res:
39
+ - '32'
40
+ - '16'
41
+ - '8'
42
+ feature_indices: '{0, 1, 2}'
43
+ in_channels: '256'
44
+ discriminator_optimizer:
45
+ _target_: <function get_optimizer at 0x7feb4c86a660>
46
+ betas:
47
+ - '0.9'
48
+ - '0.999'
49
+ eps: 1e-08
50
+ fused: 'False'
51
+ lr: '0.0001'
52
+ model: null
53
+ optim_type: adamw
54
+ weight_decay: '0.01'
55
+ discriminator_scheduler:
56
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
57
+ cycle_lengths:
58
+ - '10000000000'
59
+ f_max:
60
+ - '1.0'
61
+ f_min:
62
+ - '1.0'
63
+ f_start:
64
+ - 1e-06
65
+ warm_up_steps:
66
+ - '0'
67
+ enable_gradient_in_rollout: 'True'
68
+ enable_preprocessors: 'True'
69
+ fake_score: null
70
+ fake_score_net:
71
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
72
+ audio_hidden_size: '32'
73
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
74
+ in_dim: '65'
75
+ merge_lora: 'False'
76
+ mode: v2v
77
+ model_size: 1.3B
78
+ net_pred_type: flow
79
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
80
+ schedule_type: rf
81
+ use_audio: 'True'
82
+ fake_score_optimizer:
83
+ _target_: <function get_optimizer at 0x7feb4c86a660>
84
+ betas:
85
+ - '0.0'
86
+ - '0.999'
87
+ eps: 1e-08
88
+ fused: 'False'
89
+ lr: 2e-06
90
+ model: null
91
+ optim_type: adamw
92
+ weight_decay: '0.01'
93
+ fake_score_pred_type: x0
94
+ fake_score_scheduler:
95
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
96
+ cycle_lengths:
97
+ - '10000000000'
98
+ f_max:
99
+ - '1.0'
100
+ f_min:
101
+ - '1.0'
102
+ f_start:
103
+ - 1e-06
104
+ warm_up_steps:
105
+ - '0'
106
+ fsdp_meta_init: 'False'
107
+ gan_loss_weight_gen: '0'
108
+ gan_r1_reg_alpha: '0.1'
109
+ gan_r1_reg_weight: '0.0'
110
+ gan_use_same_t_noise: 'False'
111
+ grad_scaler_enabled: 'False'
112
+ grad_scaler_growth_interval: '2000'
113
+ grad_scaler_init_scale: '65536.0'
114
+ guidance_scale: '4.5'
115
+ input_shape:
116
+ - '16'
117
+ - '21'
118
+ - '64'
119
+ - '64'
120
+ last_step_only: 'False'
121
+ load_student_weights: 'False'
122
+ net:
123
+ _target_: <class 'fastgen.networks.OmniAvatar.network_causal.CausalOmniAvatarWan'>
124
+ audio_hidden_size: '32'
125
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
126
+ chunk_size: '3'
127
+ in_dim: '65'
128
+ mode: v2v
129
+ model_size: 1.3B
130
+ net_pred_type: flow
131
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_1.3B_phase2/step-19500.pt
132
+ schedule_type: rf
133
+ total_num_frames: '21'
134
+ use_audio: 'True'
135
+ net_optimizer:
136
+ _target_: <function get_optimizer at 0x7feb4c86a660>
137
+ betas:
138
+ - '0.0'
139
+ - '0.999'
140
+ eps: 1e-08
141
+ fused: 'False'
142
+ lr: 2e-06
143
+ model: null
144
+ optim_type: adamw
145
+ weight_decay: '0.01'
146
+ net_scheduler:
147
+ _target_: <class 'fastgen.utils.lr_scheduler.LambdaLinearScheduler'>
148
+ cycle_lengths:
149
+ - '10000000000'
150
+ f_max:
151
+ - '1.0'
152
+ f_min:
153
+ - '1.0'
154
+ f_start:
155
+ - 1e-06
156
+ warm_up_steps:
157
+ - '0'
158
+ precision: bfloat16
159
+ precision_amp: null
160
+ precision_amp_enc: null
161
+ precision_amp_infer: null
162
+ precision_fsdp: bfloat16
163
+ pretrained_model_path: ''
164
+ pretrained_student_net_path: ''
165
+ same_step_across_blocks: 'True'
166
+ sample_t_cfg:
167
+ log_t_df: '0.01'
168
+ max_t: '0.999'
169
+ min_t: '0.001'
170
+ shift: '5.0'
171
+ t_list:
172
+ - '0.999'
173
+ - '0.937'
174
+ - '0.833'
175
+ - '0.624'
176
+ - '0.0'
177
+ time_dist_type: shifted
178
+ train_p_mean: '-1.1'
179
+ train_p_std: '2.0'
180
+ skip_layers: null
181
+ start_gradient_frame: '0'
182
+ student_sample_steps: '4'
183
+ student_sample_type: sde
184
+ student_update_freq: '5'
185
+ teacher:
186
+ _target_: <class 'fastgen.networks.OmniAvatar.network.OmniAvatarWan'>
187
+ audio_hidden_size: '32'
188
+ base_model_paths: /home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,/home/work/.local/OmniAvatar/pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
189
+ in_dim: '65'
190
+ merge_lora: 'True'
191
+ mode: v2v
192
+ model_size: 14B
193
+ net_pred_type: flow
194
+ omniavatar_ckpt_path: /home/work/output_omniavatar_v2v_phase2/step-10500.pt
195
+ schedule_type: rf
196
+ use_audio: 'True'
197
+ use_ema: 'False'
198
+ model_class:
199
+ _target_: <class 'fastgen.methods.omniavatar_self_forcing.OmniAvatarSelfForcingModel'>
200
+ config: null
201
+ trainer:
202
+ augment_pipe: null
203
+ batch_size_global: null
204
+ callbacks:
205
+ ema:
206
+ _target_: <class 'fastgen.callbacks.ema.EMACallback'>
207
+ beta: '0.9999'
208
+ ema_halflife_kimg: '500'
209
+ ema_rampup_ratio: '0.05'
210
+ gamma: '16.97'
211
+ start_iter: '0'
212
+ type: constant
213
+ gpu_stats:
214
+ _target_: <class 'fastgen.callbacks.gpu_stats.GPUStatsCallback'>
215
+ every_n: '100'
216
+ grad_clip:
217
+ _target_: <class 'fastgen.callbacks.grad_clip.GradClipCallback'>
218
+ grad_norm: '10.0'
219
+ model_key: net
220
+ param_count:
221
+ _target_: <class 'fastgen.callbacks.param_count.ParamCountCallback'>
222
+ train_profiler:
223
+ _target_: <class 'fastgen.callbacks.train_profiler.TrainProfilerCallback'>
224
+ every_n: '100'
225
+ wandb:
226
+ _target_: <class 'fastgen.callbacks.wandb.WandbCallback'>
227
+ fps: '25'
228
+ sample_logging_iter: '100'
229
+ checkpointer:
230
+ pretrained_ckpt_key_map:
231
+ net: net
232
+ pretrained_ckpt_path: /home/work/.local/hyunbin/FastGen/FASTGEN_OUTPUT/OmniAvatar-FastGen/omniavatar_df/df_4gpu_bs16_lr1e5_10000iter_shift_5/checkpoints/0005000.pth
233
+ s3_container: s3://checkpoints/fastgen
234
+ s3_credential: ./credentials/s3.json
235
+ save_dir: /tmp/FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_no_reqgrad_toggle/checkpoints
236
+ use_s3: 'False'
237
+ cudnn:
238
+ benchmark: 'True'
239
+ deterministic: 'False'
240
+ ddp: 'False'
241
+ fsdp: 'True'
242
+ fsdp_cpu_offload: 'False'
243
+ fsdp_min_num_params: '10000000'
244
+ fsdp_sharding_group_size: null
245
+ global_vars: null
246
+ global_vars_val:
247
+ - null
248
+ grad_accum_rounds: '2'
249
+ logging_iter: '1'
250
+ max_iter: '10'
251
+ offload_module_in_decoding: 'False'
252
+ resume: 'False'
253
+ save_ckpt_iter: '100'
254
+ seed: '0'
255
+ skip_initial_validation: 'True'
256
+ tf32_enabled: 'True'
257
+ val_seed: null
258
+ validation_iter: '100'
259
+ visualize_teacher: 'False'
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68cfb6be85ca8bb4cb2c99e580d579759b315332fe04ff9583e8a17503710b70
3
+ size 614328
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58d137502c435eeaa445c868ec414ada748effb592f340238578f82a40be7a9c
3
+ size 1424446005
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60a72b6714edac135314634cae333779c2cdc040ac2bae1b34bfe521981579d
3
+ size 1401881227
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c33b96f7c9d96d1b61c2817be29f21a1fffbc41eef2ba6d7e4d7d4bb3eb2543
3
+ size 1406759189
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.net_model/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c16e8089a70fb63a6ef5e83f93b2bc5ef002d78b3e8f1c61ae1fd9386e18db0b
3
+ size 1401210558
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c9e7bf7b483afea9b5f5fc9f6d1a8368eea2246e5e5404c08b9fba8acdc0064
3
+ size 1901
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa773eb266887bcf43277c27dddf54d6f73581fb8a49aefd509ff3fd7699d8b
3
+ size 614328
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f79a6720f3855aca52a768cc9b64766d16fbc6af95c459f3d236fee4056a8b2
3
+ size 1424446005
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e4b4d3a27464533ae9b16fe7eb25319df4483379605ce23676ce66ee538e9c
3
+ size 1401881227
FASTGEN_SF_OUTPUT/OmniAvatar-FastGen/omniavatar_sf/sf_sink1_window7_redmd_syncc_beta0p25/checkpoints/0000200.net_model/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c6fafdbb008d602689cc51bdb7811e31fdab406e70859f6f647970016bfde2a
3
+ size 1406759189